In [89]:
import pandas as pd

# What is Pandas ?
Pandas is a Python library used for working with data sets. It has functions for analyzing, cleaning, exploring, and manipulating data. The name "Pandas" has a reference to both "Panel Data", and "Python Data Analysis" and was created by Wes McKinney in 2008.

"panel data", an econometrics term for data sets that include observations over multiple time periods for the same individuals.


Pandas will extract the data from that CSV into a DataFrame — a table, basically — then let you do things like:
* Calculate statistics and answer questions about the data: -What's the average, median, max, or min of each column? -Does column A correlate with column B? -What does the distribution of data in column C look like?
* Clean the data by doing things like removing missing values and filtering rows or columns by some criteria
* Store the cleaned, transformed data back into a CSV, other file or database


# Why Use Pandas?
Pandas allows us to analyze big data and make conclusions based on statistical theories. Pandas can clean messy data sets, and make them readable and relevant. Relevant data is very important in data science.

# What Can Pandas Do?

* Fast and efficient DataFrame object with default and customized indexing.
* Tools for loading data into in-memory data objects from different file formats.
* Data alignment and integrated handling of missing data.
* Reshaping and pivoting of date sets.
* Label-based slicing, indexing and subsetting of large data sets.
* Columns from a data structure can be deleted or inserted.
* Group by data for aggregation and transformations.
* High performance merging and joining of data.

## Pandas Series
A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.
If nothing else is specified, the values are labeled with their index number. First value has index 0.

In [90]:
#Pandas Series from a list:
a = [1, 7, 2]
print( pd.Series(a))

0    1
1    7
2    2
dtype: int64


In [91]:
# Indexing
a = [1, 7, 2]
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)
print("---------")
print(myvar["y"])

x    1
y    7
z    2
dtype: int64
---------
7


In [92]:
seri = pd.Series([10,88,3,4,5,22,14,125,45,52])
print(seri.axes)
print("---------")
print(seri.head(3))
print("---------")
print(seri.tail(3))

[RangeIndex(start=0, stop=10, step=1)]
---------
0    10
1    88
2     3
dtype: int64
---------
7    125
8     45
9     52
dtype: int64


In [93]:
#Key/Value Objects as Series
#You can also use a key/value object, like a dictionary, when creating a Series.
dict1 = {  
 "brand": "Ford",  
 "model": "Mustang",
 "electric": False,  
 "engine_type": "V-8",
 "100km/h": 8.2,
 "year": 1964,  
 "colors": ["red", "white", "black"]  
}
seri = pd.Series(dict1)
print(seri)

brand                         Ford
model                      Mustang
electric                     False
engine_type                    V-8
100km/h                        8.2
year                          1964
colors         [red, white, black]
dtype: object


In [94]:
seri = pd.Series([1962,1997,1999,2001], index = ["Michael","John","Muhammed","Mike"])
print(seri)
print("---------")
print(seri.index)
print("---------")
print(seri.keys)
print("---------")
print(seri.values)
print("---------")
print(list(seri.items()))
print("---------")
print("John" in seri)
print("---------")
print(seri[["John", "Mike"]])
print("---------")
print(seri["John"] == 1997)

Michael     1962
John        1997
Muhammed    1999
Mike        2001
dtype: int64
---------
Index(['Michael', 'John', 'Muhammed', 'Mike'], dtype='object')
---------
<bound method Series.keys of Michael     1962
John        1997
Muhammed    1999
Mike        2001
dtype: int64>
---------
[1962 1997 1999 2001]
---------
[('Michael', 1962), ('John', 1997), ('Muhammed', 1999), ('Mike', 2001)]
---------
True
---------
John    1997
Mike    2001
dtype: int64
---------
True


# Pandas DataFrame
A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

In [95]:
store = {
  "Laptop_Brands": ["MSI", "Acer", "Apple", "Dell", "Casper", "Lenovo", "Asus", "LG", "Alienware"],
  "Prices(€)": [1500, 1000, 2500, 1600, 550, 1800, 900, 600, 2400],
  "Colors": ["Black", "Red", "Gray", "Blue", "Gray", "Black", "White", "White", "Green"],
  "Weight(gr)": [1900,2400,1400,1600,2700,2800,1900,1700,2500]
}
#load data into a DataFrame object:
laptop_features = pd.DataFrame(store)
print(laptop_features) 

  Laptop_Brands  Prices(€) Colors  Weight(gr)
0           MSI       1500  Black        1900
1          Acer       1000    Red        2400
2         Apple       2500   Gray        1400
3          Dell       1600   Blue        1600
4        Casper        550   Gray        2700
5        Lenovo       1800  Black        2800
6          Asus        900  White        1900
7            LG        600  White        1700
8     Alienware       2400  Green        2500


In [96]:
#Return row 0:
print(laptop_features.loc[0])

Laptop_Brands      MSI
Prices(€)         1500
Colors           Black
Weight(gr)        1900
Name: 0, dtype: object


In [97]:
#use a list of indexes:
print(laptop_features.loc[[0, 1]])

  Laptop_Brands  Prices(€) Colors  Weight(gr)
0           MSI       1500  Black        1900
1          Acer       1000    Red        2400


In [98]:
#Add a list of names to give each row a name:
person1 = {
  "calories": [420, 380, 390, 430, 440, 395, 420],
  "duration": [50, 40, 45, 60, 40, 45, 50]
}

sport_data = pd.DataFrame(person1, index = ["Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6", "Day 7"])
print(sport_data) 
print("-----------------")
print(sport_data.loc["Day 2"])
print("-----------------")

       calories  duration
Day 1       420        50
Day 2       380        40
Day 3       390        45
Day 4       430        60
Day 5       440        40
Day 6       395        45
Day 7       420        50
-----------------
calories    380
duration     40
Name: Day 2, dtype: int64
-----------------


In [99]:
## Add new columns
sport_data["Activity"] = ["Runnig","Walking","Gym","Ice skating", "Football", "Basketball", "Swimming"]
print(sport_data) 


       calories  duration     Activity
Day 1       420        50       Runnig
Day 2       380        40      Walking
Day 3       390        45          Gym
Day 4       430        60  Ice skating
Day 5       440        40     Football
Day 6       395        45   Basketball
Day 7       420        50     Swimming


In [100]:
# Load a comma separated file (CSV file) into a DataFrame:
netflix_data = pd.read_csv('netflix.csv')
netflix_data.head()


Unnamed: 0,Country,Total Library Size,No. of TV Shows,No. of Movies,Cost Per Month - Basic ($),Cost Per Month - Standard ($),Cost Per Month - Premium ($)
0,Argentina,4760,3154,1606,3.74,6.3,9.26
1,Austria,5640,3779,1861,9.03,14.67,20.32
2,Bolivia,4991,3155,1836,7.99,10.99,13.99
3,Bulgaria,6797,4819,1978,9.03,11.29,13.54
4,Chile,4994,3156,1838,7.07,9.91,12.74


In [101]:
covid_variants = pd.read_csv('covid_variants.csv')
covid_variants.head()

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-07-06,Alpha,0,0.0,3
1,Angola,2020-07-06,B.1.1.277,0,0.0,3
2,Angola,2020-07-06,B.1.1.302,0,0.0,3
3,Angola,2020-07-06,B.1.1.519,0,0.0,3
4,Angola,2020-07-06,B.1.160,0,0.0,3


In [102]:
googleplaystore = pd.read_csv('googleplaystore.csv')
googleplaystore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


# Read JSON
Big data sets are often stored, or extracted as JSON. JSON is plain text, but has the format of an object, and is well known in the world of programming, including Pandas. JSON objects have the same format as Python dictionaries.

In [103]:
df = pd.read_json('iris.json')
#Load the JSON file into a DataFrame:
print(df.to_string()) 

     sepalLength  sepalWidth  petalLength  petalWidth     species
0            5.1         3.5          1.4         0.2      setosa
1            4.9         3.0          1.4         0.2      setosa
2            4.7         3.2          1.3         0.2      setosa
3            4.6         3.1          1.5         0.2      setosa
4            5.0         3.6          1.4         0.2      setosa
5            5.4         3.9          1.7         0.4      setosa
6            4.6         3.4          1.4         0.3      setosa
7            5.0         3.4          1.5         0.2      setosa
8            4.4         2.9          1.4         0.2      setosa
9            4.9         3.1          1.5         0.1      setosa
10           5.4         3.7          1.5         0.2      setosa
11           4.8         3.4          1.6         0.2      setosa
12           4.8         3.0          1.4         0.1      setosa
13           4.3         3.0          1.1         0.1      setosa
14        

# loc & iloc
These are used in slicing of data from the Pandas DataFrame. They help in the convenient selection of data from the DataFrame. They are used in filtering the data according to some conditions.

loc() is label based data selecting method which means that we have to pass the name of the row or column which we want to select. This method includes the last element of the range passed in it.

iloc() is a indexed based selecting method which means that we have to pass integer index in the method to select specific row/column.

In [104]:
cars = pd.DataFrame({'Brand' : ['Ferrari', 'Mercedes', 'BMW',
                                'Alfa Romeo', 'Fiat', 'Hyundai',
                                'Renault', 'Toyota', 'Mustang'],
                     'Year' : [2020, 2022, 2014, 2015, 2013, 
                               2016, 2014, 2018, 2019],
                     'KmsDriven' : [50000, 30000, 60000, 
                                     25000, 10000, 46000, 
                                     31000, 15000, 12000],
                     'Country' : ['Italy', 'Germany', 'Germany', 
                               'Italy', 'Italy', 'Japan', 
                               'France','Japan',  'USA'],
                     })
   
print(cars)

        Brand  Year  KmsDriven  Country
0     Ferrari  2020      50000    Italy
1    Mercedes  2022      30000  Germany
2         BMW  2014      60000  Germany
3  Alfa Romeo  2015      25000    Italy
4        Fiat  2013      10000    Italy
5     Hyundai  2016      46000    Japan
6     Renault  2014      31000   France
7      Toyota  2018      15000    Japan
8     Mustang  2019      12000      USA


In [105]:
print(cars.loc[(cars.Country == 'Italy') & (cars.Year > 2012)])
# Selecting data according to some conditions

        Brand  Year  KmsDriven Country
0     Ferrari  2020      50000   Italy
3  Alfa Romeo  2015      25000   Italy
4        Fiat  2013      10000   Italy


In [106]:
print(cars.loc[2 : 5])
# Selecting a range of rows from the DataFrame

        Brand  Year  KmsDriven  Country
2         BMW  2014      60000  Germany
3  Alfa Romeo  2015      25000    Italy
4        Fiat  2013      10000    Italy
5     Hyundai  2016      46000    Japan


In [107]:
# updating values 
cars.loc[(cars.Year > 2021), ['KmsDriven']] = 1000
print(cars)

        Brand  Year  KmsDriven  Country
0     Ferrari  2020      50000    Italy
1    Mercedes  2022       1000  Germany
2         BMW  2014      60000  Germany
3  Alfa Romeo  2015      25000    Italy
4        Fiat  2013      10000    Italy
5     Hyundai  2016      46000    Japan
6     Renault  2014      31000   France
7      Toyota  2018      15000    Japan
8     Mustang  2019      12000      USA


In [108]:
# Selecting rows using integer indices
print(cars.iloc[[0, 2, 4, 7]])

     Brand  Year  KmsDriven  Country
0  Ferrari  2020      50000    Italy
2      BMW  2014      60000  Germany
4     Fiat  2013      10000    Italy
7   Toyota  2018      15000    Japan


#  Concatenate Two or More Pandas DataFrames
A concatenation of two or more data frames can be done using pandas.concat() method. concat() in pandas works by combining Data Frames across rows or columns. We can concat two or more data frames either along rows  (axis=0) or along columns (axis=1)

In [109]:
company1 = {'Name':['Raphael', 'Luca', 'Alessio', 'Elisa', 'John', 'Riccardo', 'Carlo', 'Michael', 'Axel'],
        'Age':[27, 24, 22, 32, 23, 25, 34, 42, 35],
        'Country':['Germany', 'Italy', 'Italy', 'Italy', 'USA', 'Italy', 'Italy', 'USA', 'Germany'],
        'Qualification':['MSc', 'MA', 'MCA', 'PhD', 'MSc', 'MSc', 'MSc', 'MSc', 'MSc']}
company1 = pd.DataFrame(company1)
print(company1)

       Name  Age  Country Qualification
0   Raphael   27  Germany           MSc
1      Luca   24    Italy            MA
2   Alessio   22    Italy           MCA
3     Elisa   32    Italy           PhD
4      John   23      USA           MSc
5  Riccardo   25    Italy           MSc
6     Carlo   34    Italy           MSc
7   Michael   42      USA           MSc
8      Axel   35  Germany           MSc


In [110]:
company2 = {'Name':['Hulk', 'Ronaldo', 'Hans', 'Ayse', 'Peter', 'Xavi', 'Nani', 'Andreas', 'Patrick'],
        'Age':[32, 40, 34, 42, 35, 24, 22, 32, 23],
        'Country':['Brazil', 'Brazil', 'Germany', 'Turkey', 'UK', 'Spain', 'Portugal', 'Austria', 'Norway'],
        'Qualification':['PhD', 'MA', 'MCA', 'PhD', 'MSc', 'MSc', 'MSc', 'MSc', 'MSc']}
company2 = pd.DataFrame(company2)
print(company2)

      Name  Age   Country Qualification
0     Hulk   32    Brazil           PhD
1  Ronaldo   40    Brazil            MA
2     Hans   34   Germany           MCA
3     Ayse   42    Turkey           PhD
4    Peter   35        UK           MSc
5     Xavi   24     Spain           MSc
6     Nani   22  Portugal           MSc
7  Andreas   32   Austria           MSc
8  Patrick   23    Norway           MSc


In [111]:
# concatenating company1 and company2 along columns 
pd.concat([company1, company2], axis=1)

Unnamed: 0,Name,Age,Country,Qualification,Name.1,Age.1,Country.1,Qualification.1
0,Raphael,27,Germany,MSc,Hulk,32,Brazil,PhD
1,Luca,24,Italy,MA,Ronaldo,40,Brazil,MA
2,Alessio,22,Italy,MCA,Hans,34,Germany,MCA
3,Elisa,32,Italy,PhD,Ayse,42,Turkey,PhD
4,John,23,USA,MSc,Peter,35,UK,MSc
5,Riccardo,25,Italy,MSc,Xavi,24,Spain,MSc
6,Carlo,34,Italy,MSc,Nani,22,Portugal,MSc
7,Michael,42,USA,MSc,Andreas,32,Austria,MSc
8,Axel,35,Germany,MSc,Patrick,23,Norway,MSc


In [112]:
# concatenating company1 and company2 along rows
pd.concat([company1, company2], axis=0,  ignore_index=True)

Unnamed: 0,Name,Age,Country,Qualification
0,Raphael,27,Germany,MSc
1,Luca,24,Italy,MA
2,Alessio,22,Italy,MCA
3,Elisa,32,Italy,PhD
4,John,23,USA,MSc
5,Riccardo,25,Italy,MSc
6,Carlo,34,Italy,MSc
7,Michael,42,USA,MSc
8,Axel,35,Germany,MSc
9,Hulk,32,Brazil,PhD


In [113]:
pd.concat([company1, company2], keys=['Company in Paris','Company in London'])

Unnamed: 0,Unnamed: 1,Name,Age,Country,Qualification
Company in Paris,0,Raphael,27,Germany,MSc
Company in Paris,1,Luca,24,Italy,MA
Company in Paris,2,Alessio,22,Italy,MCA
Company in Paris,3,Elisa,32,Italy,PhD
Company in Paris,4,John,23,USA,MSc
Company in Paris,5,Riccardo,25,Italy,MSc
Company in Paris,6,Carlo,34,Italy,MSc
Company in Paris,7,Michael,42,USA,MSc
Company in Paris,8,Axel,35,Germany,MSc
Company in London,0,Hulk,32,Brazil,PhD


In [114]:
class1 = pd.DataFrame({
    'name': ['Alex', 'Bob', 'Cindra', 'Davion'],
    'machine_learning': [90,89,82,70],
    'network_science': [86,95,83,66],
    'game_theory': [41,91,55,66],
    'internet': [71,41,0,5],
    'high_level_programming': [11,91,37,20],
    '5G': [21,51,37,10],
    'thesis': [81,21,87,40]
})
class2 = pd.DataFrame({
    'name': ['Edward', 'Luis', 'Alba', 'Can'],
    'machine_learning': [100,89,82,70],
    'network_science': [46,95,83,66],
    'game_theory': [100,91,88,66],
    'internet': [20,22,33,5],
    'high_level_programming': [55,66,37,20],
    '5G': [77,51,55,10],
    'thesis': [44,21,87,40]
})
pd.concat([class1, class2], sort=True)


Unnamed: 0,5G,game_theory,high_level_programming,internet,machine_learning,name,network_science,thesis
0,21,41,11,71,90,Alex,86,81
1,51,91,91,41,89,Bob,95,21
2,37,55,37,0,82,Cindra,83,87
3,10,66,20,5,70,Davion,66,40
0,77,100,55,20,100,Edward,46,44
1,51,91,66,22,89,Luis,95,21
2,55,88,37,33,82,Alba,83,87
3,10,66,20,5,70,Can,66,40


# Aggregation & Grouping

Aggregation in pandas provides various functions that perform a mathematical or logical operation on our dataset and returns a summary of that function. Aggregation can be used to get a summary of columns in our dataset like getting sum, minimum, maximum, etc. from a particular column of our dataset. The function used for aggregation is agg(), the parameter is the function we want to perform.

# Aggregation
* sum()         :Compute sum of column values
* min()          :Compute min of column values
* max()         :Compute max of column values
* mean()       :Compute mean of column
* size()          :Compute column sizes
* describe()  :Generates descriptive statistics
* first()          :Compute first of group values
* last()          :Compute last of group values
* count()       :Compute count of column values
* std()           :Standard deviation of column
* var()           :Compute variance of column
* sem()         :Standard error of the mean of column

In [115]:
class1.head()

Unnamed: 0,name,machine_learning,network_science,game_theory,internet,high_level_programming,5G,thesis
0,Alex,90,86,41,71,11,21,81
1,Bob,89,95,91,41,91,51,21
2,Cindra,82,83,55,0,37,37,87
3,Davion,70,66,66,5,20,10,40


In [116]:
class1.sum()

name                      AlexBobCindraDavion
machine_learning                          331
network_science                           330
game_theory                               253
internet                                  117
high_level_programming                    159
5G                                        119
thesis                                    229
dtype: object

In [117]:
class1.describe()

Unnamed: 0,machine_learning,network_science,game_theory,internet,high_level_programming,5G,thesis
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,82.75,82.5,63.25,29.25,39.75,29.75,57.25
std,9.215024,12.124356,21.140404,33.290389,35.827131,17.988422,31.941353
min,70.0,66.0,41.0,0.0,11.0,10.0,21.0
25%,79.0,78.75,51.5,3.75,17.75,18.25,35.25
50%,85.5,84.5,60.5,23.0,28.5,29.0,60.5
75%,89.25,88.25,72.25,48.5,50.5,40.5,82.5
max,90.0,95.0,91.0,71.0,91.0,51.0,87.0


In [118]:
class1.agg(['sum', 'min', 'max'])

Unnamed: 0,name,machine_learning,network_science,game_theory,internet,high_level_programming,5G,thesis
sum,AlexBobCindraDavion,331,330,253,117,159,119,229
min,Alex,70,66,41,0,11,10,21
max,Davion,90,95,91,71,91,51,87


# Grouping 
is used to group data using some criteria from our dataset. It is used as split-apply-combine strategy.
* Splitting the data into groups based on some criteria.
* Applying a function to each group independently.
* Combining the results into a data structure.

In [119]:
a = class1.groupby('machine_learning')
a.first()

Unnamed: 0_level_0,name,network_science,game_theory,internet,high_level_programming,5G,thesis
machine_learning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,Davion,66,66,5,20,10,40
82,Cindra,83,55,0,37,37,87
89,Bob,95,91,41,91,51,21
90,Alex,86,41,71,11,21,81


# Pandas - Cleaning Data
Data cleaning means fixing bad data in your data set.

Bad data could be:

* Empty cells
* Data in wrong format
* Wrong data
* Duplicates

# Exploratory Data Analysis or (EDA) 
is understanding the data sets by summarizing their main characteristics often plotting them visually.
* 1. Importing the required libraries for EDA(pandas,numpy,seaborn,matplotlib)
* 2. Loading the data into the data frame (just read the CSV into a data frame and pandas data frame does the job for us.)
* 3. Checking the types of data data.dtypes, convert string to the integer data
* 5. Renaming the columns
* 6. Dropping the duplicate rows
* 7. Dropping the missing or null values.



In [120]:
googleplaystore.tail()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


In [121]:
print(googleplaystore.info()) 
#The result tells us there are 10841 rows and 13 columns:
#And the name of each column, with the data type:
#info() method also tells us how many Non-Null values there are present in each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
None


In [122]:
googleplaystore = googleplaystore.drop(labels= ['Type', 'Content Rating','Last Updated','Current Ver','Android Ver','Price'], axis=1)
googleplaystore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Art & Design
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Art & Design
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Art & Design
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Art & Design;Creativity


In [123]:
duplicate_rows_data = googleplaystore[googleplaystore.duplicated()]
print('number of duplicate rows: ', duplicate_rows_data.shape)

number of duplicate rows:  (485, 7)


In [124]:
googleplaystore.count()

App         10841
Category    10841
Rating       9367
Reviews     10841
Size        10841
Installs    10841
Genres      10841
dtype: int64

In [125]:
# Dropping the duplicate rows
googleplaystore = googleplaystore.drop_duplicates()
googleplaystore.count()

App         10356
Category    10356
Rating       8891
Reviews     10356
Size        10356
Installs    10356
Genres      10356
dtype: int64

In [126]:
# Dropping the missing or null values.
googleplaystore = googleplaystore.dropna() 
googleplaystore.count()

App         8891
Category    8891
Rating      8891
Reviews     8891
Size        8891
Installs    8891
Genres      8891
dtype: int64