In [1]:
import pandas as pd
import numpy as np

# 1. Working with Pandas Series

### a) Creating series

#### Series through list

In [2]:
lst = [1,2,3,4,5]

pd.Series(lst)

0    1
1    2
2    3
3    4
4    5
dtype: int64

#### Series through Numpy array


In [3]:
arr = np.array([1,2,3,4,5])
pd.Series(arr)

0    1
1    2
2    3
3    4
4    5
dtype: int64

#### Giving index from our own end

In [4]:
pd.Series(data = ['Jiya','Priya','Rohan'],index = [1,2,3])


1     Jiya
2    Priya
3    Rohan
dtype: object

#### Series through dictionary values

In [5]:
steps = {"day_1" : 4000 , "day_2" : 3000 , "day_3" : 12000}
pd.Series(steps)

day_1     4000
day_2     3000
day_3    12000
dtype: int64

#### Repeat function with creating series

In [6]:
pd.Series(5).repeat(7)

0    5
0    5
0    5
0    5
0    5
0    5
0    5
dtype: int64

#### reset fn is used to make index accurate

In [7]:
pd.Series(5).repeat(3).reset_index()

Unnamed: 0,index,0
0,0,5
1,0,5
2,0,5


#### below code will indicate that 
#### 10 should repeat 5 times and 20 should repeat 2 times

In [8]:
s = pd.Series([10,20]).repeat([5,2]).reset_index(drop = True)

s

0    10
1    10
2    10
3    10
4    10
5    20
6    20
dtype: int64

#### Accessing the element

In [9]:
s[0]

np.int64(10)

In [10]:
s[2:-2]

2    10
3    10
4    10
dtype: int64

### b) Aggregate function on Panndas Series

In [11]:
sr = pd.Series([1,2,3,4,5,6,7,8])

sr.agg([min,max,sum])

  sr.agg([min,max,sum])
  sr.agg([min,max,sum])
  sr.agg([min,max,sum])


min     1
max     8
sum    36
dtype: int64

### c) Series absolute function

In [12]:
sr = pd.Series([1,2,-3,4,-5,-6,7,8])

sr.abs()

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

### d) Appending Series

### Pandas Series.append() is used to concatenate two or more series - for older version. 
 #### New version = pd.concat()

In [13]:
sr1 = pd.Series([1,2,3,4,5,6,7,8])
sr2 = pd.Series([-1,-2,-3,4,-5,-6,-7,-8])

sr3 = pd.concat([sr2,sr1])

sr3

0   -1
1   -2
2   -3
3    4
4   -5
5   -6
6   -7
7   -8
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

#### To make index accurate

In [14]:
sr3.reset_index(drop = True)

0    -1
1    -2
2    -3
3     4
4    -5
5    -6
6    -7
7    -8
8     1
9     2
10    3
11    4
12    5
13    6
14    7
15    8
dtype: int64

### e) Astype function

In [15]:
sr1 ## here we can seet that int64 is mentioned

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [16]:
type(sr1[0])

numpy.int64

In [17]:
sr1.astype(float)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
7    8.0
dtype: float64

#### f) Between Functions - method used on a series to check which values lie between first and second argument

In [18]:
sr1 = pd.Series([1,2,38,4,5,6,7,8,9,10])

sr1

0     1
1     2
2    38
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [19]:
sr1.between(10,50)

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
8    False
9     True
dtype: bool

### g) All Strings functions can be used to extract or modify texts in series

#### Upper and Lower function

In [20]:
ser = pd.Series(["JIya la" , "Data Science" , "GFG" , "Hello World" , "Machine Learning"])

In [21]:
print(ser.str.upper())

print(ser.str.lower())

0             JIYA LA
1        DATA SCIENCE
2                 GFG
3         HELLO WORLD
4    MACHINE LEARNING
dtype: object
0             jiya la
1        data science
2                 gfg
3         hello world
4    machine learning
dtype: object


#### Length of function

In [22]:
for i in ser:
    print(len(i))

7
12
3
11
16


#### Strip Function

In [23]:
ser = pd.Series(["   JIya la" , "     Data Science" , "GFG" , "   Hello World" , "Machine Learning"])

for i in ser:
    print(i ,len(i))
    
## here extra spaces will also get printed

   JIya la 10
     Data Science 17
GFG 3
   Hello World 14
Machine Learning 16


In [24]:
ser = ser.str.strip()

for i in ser:
      print(i ,len(i))

JIya la 7
Data Science 12
GFG 3
Hello World 11
Machine Learning 16


### Split function 

In [25]:
ser.str.split()[0]

['JIya', 'la']

#### COntains function - checks if anything is contained or not

In [26]:
ser = pd.Series(["Jiya la" , "Data@Science" , "GFG" , "Hello World" , "Machine Learning"])

ser.str.contains('@')

0    False
1     True
2    False
3    False
4    False
dtype: bool

#### Replace fucntion



In [27]:
ser = pd.Series(["Jiya la" , "Data@Science" , "GFG" , "Hello World" , "Machine Learning"])

ser.str.replace('@' , ' ')

0             Jiya la
1        Data Science
2                 GFG
3         Hello World
4    Machine Learning
dtype: object

### Count function
##### counts anything present in the string

In [28]:
ser.str.count('a')

0    2
1    2
2    0
3    0
4    2
dtype: int64

### startswith and endswith

In [29]:
print(ser.str.endswith('e'))
print(ser.str.startswith('D'))

0    False
1     True
2    False
3    False
4    False
dtype: bool
0    False
1     True
2    False
3    False
4    False
dtype: bool


#### Find Function
###### tells us if the word is present or not

In [30]:
ser = pd.Series(["Jiya la" , "Data@Science" , "GFG" , "Hello World" , "Machine Learning"])

ser.str.find('Data')

0   -1
1    0
2   -1
3   -1
4   -1
dtype: int64

### h) Converting a series into a list 

In [31]:
ser.to_list()

['Jiya la', 'Data@Science', 'GFG', 'Hello World', 'Machine Learning']

## Module 2 - Pandas Dataframe

#### Data frame is a two dimensional data structure , data is aligned in a tabular form in rows an columns , 

### a) Creating Dataframe

In [32]:
lst = ['Geeks' ,'for' ,'Geeks' , 'is' , 'a' ,'portal','for','Geeks']
pd.DataFrame(lst)

Unnamed: 0,0
0,Geeks
1,for
2,Geeks
3,is
4,a
5,portal
6,for
7,Geeks


In [33]:
lst = [["tom",10],["jerry" , 12] , ["spike" , 14]]
pd.DataFrame(lst)

Unnamed: 0,0,1
0,tom,10
1,jerry,12
2,spike,14


### Creating DataFrame from dict of ndarray/lists:
##### To create DataFrame from dict of narray/list, all the narray must be of same length. If index is passed then the length index should be equal to the length of arrays. If no index is passed, then by default, index will be range(n) where n is the array length

In [34]:
data = {'name':['Tom', 'nick', 'krish', 'jack'], 'age':[20, 21, 19, 18]}

pd.DataFrame(data)

Unnamed: 0,name,age
0,Tom,20
1,nick,21
2,krish,19
3,jack,18


A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. We can perform basic operations on rows/columns like selecting, deleting, adding, and renaming.
Column Selection: In Order to select a column in Pandas DataFrame, we can either access the columns by calling them by their columns name.

In [35]:
data = { 'Name'         :['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age'          :[27, 24, 22, 32],
         'Address'      :['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
         'Qualification':['Msc', 'MA', 'MCA', 'Phd']}

df = pd.DataFrame(data)

df[['Name', 'Qualification']] ## selecting specific data columns

Unnamed: 0,Name,Qualification
0,Jai,Msc
1,Princi,MA
2,Gaurav,MCA
3,Anuj,Phd


### b) Slicing in DataFrames Using iloc and loc

##### 
Pandas comprises many methods for its proper functioning. loc() and iloc() are one of those methods. These are used in slicing data from the Pandas DataFrame. They help in the convenient selection of data from the DataFrame in Python. They are used in filtering the data according to some conditions.

In [36]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


### Basic loc Operations

Python loc() function The loc() function is label based data selecting method which means that we have to pass the name of the row or column which we want to select. This method includes the last element of the range passed in it, unlike iloc(). loc() can accept the boolean data unlike iloc(). Many operations can be performed using the loc() method like

In [37]:
df.loc[1:2]
df.loc[1:2 , "two" : "four"] ## can also slice columns the same way. just include names. 

Unnamed: 0,two,three,four
1,20,200,2000
2,30,300,3000


### Basic iloc Operations

The iloc() function is an indexed-based selecting method which means that we have to pass an integer index in the method to select a specific row/column. This method does not include the last element of the range passed in it unlike loc(). iloc() does not accept the boolean data unlike loc()

In [38]:
df.iloc[1:-1 , 1:-1] 

Unnamed: 0,two,three
1,20,200
2,30,300


you can see index 3 of both row and column has not been added here so 1 was inclusize but 3 is exclusive in the case of ilocs

Selecting Spefic Rows

In [39]:
df.iloc[[0,2],[1,3]]


Unnamed: 0,two,four
0,10,1000
2,30,3000


### c) Slicing Using Conditions

Using Conditions works with loc basically



In [40]:
df.loc[df['two'] > 20, ['three','four']]

## So we could extract only those data for which the value is more than 20
## For the columns we have used comma(,) to extract specifc columns which is 'three' and 'four'

Unnamed: 0,three,four
2,300,3000
3,400,4000


In [41]:

df.loc[df['three'] < 300, ['one','four']]

Unnamed: 0,one,four
0,1,1000
1,2,2000


### c) Column Addition in DataFrame

We can add a column in many ways. Let us discuss three ways how we can add column here

Using List

Using Pandas Series

Using an existing Column(we can modify that column in the way we want and that modified part can also be displayed)


In [42]:
## using list
l = [22,33,44,55]
df['five'] = l
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,22
1,2,20,200,2000,33
2,3,30,300,3000,44
3,4,40,400,4000,55


In [43]:
### using series
sr = pd.Series([111,222,333,444])
df['six'] = sr
df

Unnamed: 0,one,two,three,four,five,six
0,1,10,100,1000,22,111
1,2,20,200,2000,33,222
2,3,30,300,3000,44,333
3,4,40,400,4000,55,444


In [44]:
### Using an existing Column


df['seven'] = df['one'] + 10
df

Unnamed: 0,one,two,three,four,five,six,seven
0,1,10,100,1000,22,111,11
1,2,20,200,2000,33,222,12
2,3,30,300,3000,44,333,13
3,4,40,400,4000,55,444,14


### d) Column Deletion in Dataframes

In [45]:
del df['six']

df

Unnamed: 0,one,two,three,four,five,seven
0,1,10,100,1000,22,11
1,2,20,200,2000,33,12
2,3,30,300,3000,44,13
3,4,40,400,4000,55,14


In [46]:
## Using pop

df.pop('five')

df

Unnamed: 0,one,two,three,four,seven
0,1,10,100,1000,11
1,2,20,200,2000,12
2,3,30,300,3000,13
3,4,40,400,4000,14


### e) Addition of rows

In a Pandas DataFrame, you can add rows by using the append method. You can also create a new DataFrame with the desired row values and use the append to add the new row to the original dataframe. 

In [47]:
df1 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])


df3 = pd.concat([df1 , df2]).reset_index(drop = True)

df3


Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


### f) Pandas drop function

In [48]:
df.drop([0,1], axis = 0)
## axis =0 => Rows (row wise)

Unnamed: 0,one,two,three,four,seven
2,3,30,300,3000,13
3,4,40,400,4000,14


In [49]:
### axis =1 => Columns (column wise)

df.drop(['one','three'], axis = 1, inplace = True)
df

Unnamed: 0,two,four,seven
0,10,1000,11
1,20,2000,12
2,30,3000,13
3,40,4000,14


### g) Transposing a DataFrame

The .T attribute in a Pandas DataFrame is used to transpose the dataframe, i.e., to flip the rows and columns

In [50]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [51]:
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


### h) A set of more DataFrame Functionalities 

In [52]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


1. axes function

The .axes attribute in a Pandas DataFrame returns a list with the row and column labels of the DataFrame. The first element of the list is the row labels (index), and the second element is the column labels.



In [53]:
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

2. ndim function
   
The .ndim attribute in a Pandas DataFrame returns the number of dimensions of the dataframe, which is always 2 for a DataFrame (row-and-column format).

In [54]:
df.ndim

2

3. dtypes

The .dtypes attribute in a Pandas DataFrame returns the data types of the columns in the DataFrame. The result is a Series with the column names as index and the data types of the columns as values.

In [55]:
df.dtypes

one      int64
two      int64
three    int64
four     int64
dtype: object

4. shape function

The .shape attribute in a Pandas DataFrame returns the dimensions (number of rows, number of columns) of the DataFrame as a tuple.

In [56]:
df.shape

(4, 4)

5. head() function

In [57]:
d = { 'Name'  :pd.Series(['Tom','Jerry','Spike','Popeye','Olive','Bluto','Mickey']),
      'Age'   :pd.Series([10,12,14,30,28,33,15]),
      'Height':pd.Series([3.25,1.11,4.12,5.47,6.15,6.67,2.61])}

df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


In [58]:
df.head(3)

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12


6. df.tail() function

In [59]:
df.tail(3)

Unnamed: 0,Name,Age,Height
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


7. empty function ()

indicates whether the data frame is empty or not. in the form of boolean

In [60]:
df = pd.DataFrame()

df.empty

True

### i) Statistical or Mathematical Functions

Sum 
 
 Mean 
 
 Median 
 
 Mode 
 
 Variance 
 
 Min 
 
 Max 
 
 Standard Deviation

In [61]:
data = {'one'   : pd.Series([1, 2, 3, 4]),
        'two'   : pd.Series([10, 20, 30, 40]),
        'three' : pd.Series([100, 200, 300, 400]),
        'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


1. Sum

In [62]:
df.sum()

one         10
two        100
three     1000
four     10000
dtype: int64

2. Mean

In [63]:
df.mean()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

3. Median 

In [64]:
df.median()

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

4. Mode

In [65]:

de = pd.DataFrame({'A': [1, 2, 2, 3, 4, 4, 4, 5], 'B': [10, 20, 20, 30, 40, 40, 50, 60]})

print('A' , de['A'].mode())
print('B' , de['B'].mode())

A 0    4
Name: A, dtype: int64
B 0    20
1    40
Name: B, dtype: int64


5. Variance 

In [66]:
df.var()

one      1.666667e+00
two      1.666667e+02
three    1.666667e+04
four     1.666667e+06
dtype: float64

6. Min - Minimum

In [67]:
df.min()

one         1
two        10
three     100
four     1000
dtype: int64

7. Max

In [68]:
df.max()

one         4
two        40
three     400
four     4000
dtype: int64

8. Standard Deviation 

In [69]:
df.std()

one         1.290994
two        12.909944
three     129.099445
four     1290.994449
dtype: float64

###  j) Describe Function

The describe() method in a Pandas DataFrame returns descriptive statistics of the data in the DataFrame. It provides a quick summary of the central tendency, dispersion, and shape of the distribution of a set of numerical data.

In [70]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000]),
        'five' : pd.Series(['A','B','C','D'])}


df = pd.DataFrame(data)

df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


### k) Pipe function

The pipe() method in a Pandas DataFrame allows you to apply a function to the DataFrame, similar to the way the apply() method works. The difference is that pipe() allows you to chain multiple operations together by passing the output of one function to the input of the next function.

In [71]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}



df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


Example 1

In [72]:
def add_(i,j):
    return i+j

def sub_(i,j):
    return i-j
    
df.pipe(add_,10) ## this wil take 2 args with add_ pass next as 10 so that in the complete data 10 will be added

Unnamed: 0,one,two,three,four
0,11,20,110,1010
1,12,30,210,2010
2,13,40,310,3010
3,14,50,410,4010


Example 2 - Mean square

In [73]:
def mean_(col):
    return col.mean()

def square(i):
    return i ** 2

df.pipe(mean_).pipe(square)

one            6.25
two          625.00
three      62500.00
four     6250000.00
dtype: float64

### 2. Apply Function

The apply() method in a Pandas DataFrame allows you to apply a function to the DataFrame, either to individual elements or to the entire DataFrame. The function can be either a built-in Python function or a user-defined function.

In [74]:
data = {'one'  : pd.Series([1, 2, 3, 4]),
        'two'  : pd.Series([10, 20, 30, 40]),
        'three': pd.Series([100, 200, 300, 400]),
        'four' : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [75]:
df.apply(np.mean)

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64

In [76]:
df.apply(lambda x: x.max() - x.min())

one         3
two        30
three     300
four     3000
dtype: int64

### 3. Apply map function

The map() method in a Pandas DataFrame allows you to apply a function to each element of a specific column of the DataFrame. The function can be either a built-in Python function or a user-defined function.

In [77]:

df.applymap(lambda x : x*100)

  df.applymap(lambda x : x*100)


Unnamed: 0,one,two,three,four
0,100,1000,10000,100000
1,200,2000,20000,200000
2,300,3000,30000,300000
3,400,4000,40000,400000


applymap and apply are both functions in the pandas library used for applying a function to elements of a pandas DataFrame or Series.

applymap is used to apply a function to every element of a DataFrame. It returns a new DataFrame where each element has been modified by the input function.

apply is used to apply a function along any axis of a DataFrame or Series. It returns either a Series or a DataFrame, depending on the axis along which the function is applied and the return value of the function. Unlike applymap, apply can take into account the context of the data, such as the row or column label.

So, applymap is meant for element-wise operations while apply can be used for both element-wise and row/column-wise operations.

In [78]:
df = pd.DataFrame({ 'A': [1.2, 3.4, 5.6], 
                    'B': [7.8, 9.1, 2.3]})

df_1 = df.applymap(np.int64)
print(df_1)

df_2 = df.apply(lambda row : row.mean(), axis = 0)
print(df_2)

   A  B
0  1  7
1  3  9
2  5  2
A    3.4
B    6.4
dtype: float64


  df_1 = df.applymap(np.int64)


### l) Reindex Function

The reindex function in Pandas is used to change the row labels and/or column labels of a DataFrame. This function can be used to align data from multiple DataFrames or to update the labels based on new data. The function takes in a list or an array of new labels as its first argument and, optionally, a fill value to replace any missing values. The reindexing can be done along either the row axis (0) or the column axis (1). The reindexed DataFrame is returned.

Example 1 - Reindex rows

In [79]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

print(df.reindex([1,2,3,4,5]))


   one   two  three    four
1  2.0  20.0  200.0  2000.0
2  3.0  30.0  300.0  3000.0
3  4.0  40.0  400.0  4000.0
4  NaN   NaN    NaN     NaN
5  NaN   NaN    NaN     NaN


In [80]:
data = {'Name' : ['John', 'Jane', 'Jim', 'Joan'],
        'Age'  : [25, 30, 35, 40],
        'City' : ['New York', 'Los Angeles', 'Chicago', 'Houston']}

df = pd.DataFrame(data)

df.reindex(columns = ['Name','City','Age'])

Unnamed: 0,Name,City,Age
0,John,New York,25
1,Jane,Los Angeles,30
2,Jim,Chicago,35
3,Joan,Houston,40


### m) Renaming Columns in Pandas DataFrame

The rename function in Pandas is used to change the row labels and/or column labels of a DataFrame. It can be used to update the names of one or multiple rows or columns by passing a dictionary of new names as its argument. The dictionary should have the old names as keys and the new names as values

In [81]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

df.rename(columns = {'one' : 'One','two': 'Two', 'three' : 'Three', 'four' : 'Four'}, 
           inplace = True, index = {0:'a',1:'b',2:'c',3:'d'})
df


Unnamed: 0,One,Two,Three,Four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


### n) Sorting in Pandas DataFrame
Pandas provides several methods to sort a DataFrame based on one or more columns.

sort_values: This method sorts the DataFrame based on one or more columns. The default sorting order is ascending, but you can change it to descending by passing the ascending argument with a value of False. bash

In [82]:
data = { 'one'   : pd.Series([23, 100, 45, 49]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,23,10,100,1000
1,100,20,200,2000
2,45,30,300,3000
3,49,40,400,4000


Sort in a specific column

In [83]:
df.sort_values(by = 'one') # here we mention a particular column we want to sort
# by default sorted in ascending order

Unnamed: 0,one,two,three,four
0,23,10,100,1000
2,45,30,300,3000
3,49,40,400,4000
1,100,20,200,2000


Sort with respect to a specific order

In [84]:
df.sort_values(by = 'one' , ascending = False)

Unnamed: 0,one,two,three,four
1,100,20,200,2000
3,49,40,400,4000
2,45,30,300,3000
0,23,10,100,1000


Sort in Scecific Order based on multiple Columns

In [85]:
df.sort_values(by = ['one','two'])

Unnamed: 0,one,two,three,four
0,23,10,100,1000
2,45,30,300,3000
3,49,40,400,4000
1,100,20,200,2000


Sort with Specific Sorting Algorithm:<br>

quicksort
mergesort
heapsort

In [86]:
df.sort_values(by = ['one'], kind = 'mergesort')

Unnamed: 0,one,two,three,four
0,23,10,100,1000
2,45,30,300,3000
3,49,40,400,4000
1,100,20,200,2000


### o) Groupby Functions

The groupby function in pandas is used to split a dataframe into groups based on one or more columns. It returns a DataFrameGroupBy object, which is similar to a DataFrame but has some additional methods to perform operations on the grouped data

In [87]:
cricket = {'Team'   : ['India', 'India', 'Australia', 'Australia', 'SA', 'SA', 'SA', 'SA', 'NZ', 'NZ', 'NZ', 'India'],
           'Rank'   : [2, 3, 1,2, 3,4 ,1 ,1,2 , 4,1,2],
           'Year'   : [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
           'Points' : [876,801,891,815,776,784,834,824,758,691,883,782]}

df = pd.DataFrame(cricket)
df

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
2,Australia,1,2014,891
3,Australia,2,2015,815
4,SA,3,2014,776
5,SA,4,2015,784
6,SA,1,2016,834
7,SA,1,2017,824
8,NZ,2,2016,758
9,NZ,4,2014,691


In [88]:
df.groupby('Team').groups # groups by team as parameter returns index at which particular team is present

{'Australia': [2, 3], 'India': [0, 1, 11], 'NZ': [8, 9, 10], 'SA': [4, 5, 6, 7]}

To search for specific Country with specific year

In [89]:
df.groupby(['Team','Year']).get_group(('Australia',2014))  # returns dataframe with country and year searched for

Unnamed: 0,Team,Rank,Year,Points
2,Australia,1,2014,891


If the data is not present then we will be getting an error

Adding some statistical computation on top of groupby

In [90]:
df.groupby('Team').sum()['Points'] # returns the sum of points for each team

Team
Australia    1706
India        2459
NZ           2332
SA           3218
Name: Points, dtype: int64

This means we have displayed the teams which are having the maximum sum in Poitns

Let us sort it to get it in a better way

In [91]:
df.groupby('Team').sum()['Points'].sort_values(ascending = False)
# sorts in descending order. groups by team and using math sum function calulates sum for points

Team
SA           3218
India        2459
NZ           2332
Australia    1706
Name: Points, dtype: int64

Checking multiple stats for points team wise

In [92]:
groups = df.groupby('Team')

groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])

# checks multiple stats using the aggregate function, and finds different stats for points column for each team

  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])
  groups['Points'].agg([np.sum, np.mean, np.std,np.max,np.min])


Unnamed: 0_level_0,sum,mean,std,max,min
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Australia,1706,853.0,53.740115,891,815
India,2459,819.666667,49.702448,876,782
NZ,2332,777.333333,97.449132,883,691
SA,3218,804.5,28.769196,834,776


filter function along with groupby

In [93]:
df.groupby('Team').filter(lambda x : len(x) == 3)

# returns data by filtering and using the filter function. filters according to the len

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
8,NZ,2,2016,758
9,NZ,4,2014,691
10,NZ,1,2015,883
11,India,2,2017,782


### 3. Working with csv files and basic data Analysis Using Pandas

a) Reading csv
Reading csv files from local system

In [94]:
df = pd.read_csv('Datasets/Football.csv')
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


#### Reading CSV files from github repositories
NOTE: The link of the page should be copied when the file is in raw format

In [95]:
link = "https://raw.githubusercontent.com/GeeksforgeeksDS/In-One-Go/refs/heads/main/Pandas/Football.csv"
file = pd.read_csv(link)
file.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


### b) Pandas Info Function
Pandas dataframe.info() function is used to get a concise summary of the dataframe. It comes really handy when doing exploratory analysis of the data. To get a quick overview of the dataset we use the dataframe.info() function.

Syntax: DataFrame.info(verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None)

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country                  660 non-null    object 
 1   League                   660 non-null    object 
 2   Club                     626 non-null    object 
 3   Player Names             660 non-null    object 
 4   Matches_Played           660 non-null    int64  
 5   Substitution             660 non-null    int64  
 6   Mins                     660 non-null    int64  
 7   Goals                    660 non-null    int64  
 8   xG                       660 non-null    float64
 9   xG Per Avg Match         660 non-null    float64
 10  Shots                    660 non-null    int64  
 11  OnTarget                 660 non-null    int64  
 12  Shots Per Avg Match      660 non-null    float64
 13  On Target Per Avg Match  660 non-null    float64
 14  Year                     6

### c) isnull() function to check if there are nan values present

In [97]:
df.isnull()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
656,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
657,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
658,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


So we can see we are getting a boolean kind of a table giving True and False
If we use the sum function along with it then we can get how many null values are present in each columns

In [98]:
df.isnull().sum()

Country                     0
League                      0
Club                       34
Player Names                0
Matches_Played              0
Substitution                0
Mins                        0
Goals                       0
xG                          0
xG Per Avg Match            0
Shots                       0
OnTarget                    0
Shots Per Avg Match         0
On Target Per Avg Match     0
Year                        0
dtype: int64

### d) Quantile function to get the specific percentile value
Let us check the 80 percentile value of each columns using describe function first

In [99]:
df.describe(percentiles=[.80])

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
80%,32.0,6.0,2915.8,15.0,14.076,0.61,90.0,39.0,3.6,1.63,2020.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0



Let us use the quantile function to get the exact value now

In [100]:
df['Mins'].quantile(.70)

np.float64(2694.3)

This funciton is important as it can be used to treat ourliers in Data Science EDA process

### e) Copy function

If we normal do:
de=df
Then a change in de will affect the data of df as well so we need to copy in such a way that it creates a totally new object and does not affect the old dataframe

In [101]:
de = df.copy()
de.head(3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016


In [102]:
de['Year+100'] = de['Year'] + 100
de.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year,Year+100
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016,2116
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016,2116
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016,2116
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016,2116
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016,2116


So we can see a new column has been added here but our old data is secured

In [103]:
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


The new column is not present here

### f) Value Counts function

Pandas Series.value_counts() function return a Series containing counts of unique values. The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default.

Syntax: Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True)

In [104]:
df['Player Names'].value_counts()

Player Names
Andrea Belotti     5
Lionel Messi       5
Luis Suarez        5
Andrej Kramaric    5
Ciro Immobile      5
                  ..
Francois Kamano    1
Lebo Mothiba       1
Gaetan Laborde     1
Falcao             1
Cody Gakpo         1
Name: count, Length: 444, dtype: int64

In [105]:
df['Player Names'].unique() # returns all unique names

array(['Juanmi Callejon', 'Antoine Griezmann', 'Luis Suarez',
       'Ruben Castro', 'Kevin Gameiro', 'Cristiano Ronaldo',
       'Karim Benzema', 'Neymar ', 'Iago Aspas', 'Sergi Enrich',
       'Aduriz ', 'Sandro Ramlrez', 'Lionel Messi', 'Gerard Moreno',
       'Morata', 'Wissam Ben Yedder', 'Willian Jose', 'Andone ',
       'Cedric Bakambu', 'Isco', 'Mohamed Salah', 'Gregoire Defrel',
       'Ciro Immobile', 'Nikola Kalinic', 'Dries Mertens',
       'Alejandro Gomez', 'Jose CallejOn', 'Iago Falque',
       'Giovanni Simeone', 'Mauro Icardi', 'Diego Falcinelli',
       'Cyril Thereau', 'Edin Dzeko', 'Lorenzo Insigne',
       'Fabio Quagliarella', 'Borriello ', 'Carlos Bacca',
       'Gonzalo Higuain', 'Keita Balde', 'Andrea Belotti', 'Fin Bartels',
       'Lars Stindl', 'Serge Gnabry', 'Wagner ', 'Andrej Kramaric',
       'Florian Niederlechner', 'Robert Lewandowski', 'Emil Forsberg',
       'Timo Werner', 'Nils Petersen', 'Vedad Ibisevic', 'Mario Gomez',
       'Maximilian Philipp',

In [106]:
df['Player Names'].nunique() # total no unique names

444

### h) dropna() function

In [107]:
df=pd.read_csv("Datasets/googleplaystore.csv")
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [108]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [109]:
df.dropna(inplace = True, axis = 0)
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6M,500+,Free,0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


deletes rows with null values (rowwise - axis = 0)

In [110]:
df.dropna(inplace = True, axis = 1)
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6M,500+,Free,0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


deletes columns with null values (columnwise - axis = 0)

### i) Fillna Function

Pandas Series.fillna() function is used to fill NA/NaN values using the specified method.

Suppose if we want to fill the null values with something instead of removing them then we can use fillna function
Here we will be filling the numerical columns with its mean values and Categorical columns with its mode

In [111]:
df=pd.read_csv("Datasets/googleplaystore.csv")
df.isnull().sum()

df['Rating']

0        4.1
1        3.9
2        4.7
3        4.5
4        4.3
        ... 
10836    4.5
10837    5.0
10838    NaN
10839    4.5
10840    4.5
Name: Rating, Length: 10841, dtype: float64

 #### Filling

Numerical columns

In [112]:
miss = round(df['Rating'].mean(),2)
df['Rating'].fillna(miss , inplace = True)

df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.10,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.90,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.70,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.50,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.30,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.50,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.00,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,4.19,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.50,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [113]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

Categorical values

In [114]:
df['Current Ver'] = df['Current Ver'].fillna('Varies on Device')
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.10,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.90,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.70,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.50,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.30,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.50,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.00,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,4.19,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.50,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


### j) sample function
Pandas sample() is used to generate a sample random row or column from the function caller data frame.

Syntax:

DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)

df.sample(5)

In [115]:
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
55,Tickets SDA 2018 and Exam from the State Traff...,AUTO_AND_VEHICLES,4.9,10479,33M,"100,000+",Free,0,Everyone,Auto & Vehicles,"July 18, 2018",1.7.1,4.0 and up
5401,AO Surgery Reference,MEDICAL,4.7,2570,3.8M,"100,000+",Free,0,Everyone,Medical,"May 10, 2017",1.2.6,4.0 and up
5725,Professional AV Solution & Products information,PRODUCTIVITY,4.5,24,22M,"1,000+",Free,0,Everyone,Productivity,"July 12, 2017",2.0.1,4.2 and up
10077,EY GlobalOne,BUSINESS,2.7,11,6.8M,"1,000+",Free,0,Everyone,Business,"November 27, 2017",3.1.0,2.2 and up
2760,Zappos – Shoe shopping made simple,SHOPPING,4.5,44588,20M,"5,000,000+",Free,0,Everyone,Shopping,"July 12, 2018",9.0.0,4.4 and up


### k) to_csv() function
Pandas Series.to_csv() function write the given series object to a comma-separated values (csv) file/format.

Syntax: Series.to_csv(*args, **kwargs)

In [116]:
data = { 'one'   : pd.Series([1, 2, 3, 4]),
         'two'   : pd.Series([10, 20, 30, 40]),
         'three' : pd.Series([100, 200, 300, 400]),
         'four'  : pd.Series([1000, 2000, 3000, 4000])}

df = pd.DataFrame(data)

df.to_csv('Datasets/Number.csv',index = False)

## when index is False : it wont return any unnamed indexes

## 4. A detailed Pandas Profile report
The pandas_profiling library in Python include a method named as ProfileReport() which generate a basic report on the input DataFrame.

The report consist of the following:

DataFrame overview, Each attribute on which DataFrame is defined, Correlations between attributes (Pearson Correlation and Spearman Correlation), and A sample of DataFrame.