In [1]:
#pandas - pandas(all lowercase) is a popular python based data analysis toolkit which can be imported using import pandas as pd. 
#it presents a diverse range of utilities, ranging from parsing multiple file format to converting an entire data table into a 
#numpy matrix array this makes pandas a trusted ally in data science and machine learning.

In [5]:
import pandas as pd
import numpy as np

In [7]:
#lets create a blank series pandas

In [11]:
series=pd.Series()

In [13]:
series #datatype=object

Series([], dtype: object)

In [15]:
series=pd.Series([2,3,4,11,13,24,25,30,34,35,37,49,50,60])

In [19]:
print(series) #datatype=int

0      2
1      3
2      4
3     11
4     13
5     24
6     25
7     30
8     34
9     35
10    37
11    49
12    50
13    60
dtype: int64


In [21]:
#notice datatype=object if u pass any string

series=pd.Series([0,1,2,'3'])
series

0    0
1    1
2    2
3    3
dtype: object

In [29]:
#create dictionary

dict={'name':['ishika', 'pooja'], 'age' : [22,23], 'school' : ['DPS','DPS']} 
dict

{'name': ['ishika', 'pooja'], 'age': [22, 23], 'school': ['DPS', 'DPS']}

In [33]:
#create data frame
df=pd.DataFrame(dict)
print(df)

     name  age school
0  ishika   22    DPS
1   pooja   23    DPS


In [35]:
#add new columns and values to it
#add new column new salary to it
#split on the basis of :

df['salary']='22:33'.split(':')

In [37]:
print(df)

     name  age school salary
0  ishika   22    DPS     22
1   pooja   23    DPS     33


In [49]:
#how to create nan values and its manipulation
#nan=not a number; filled by computer when value is not provided

dict={'name':['ishika', 'pooja', 'raj'], 'age' : [22,23, np.nan], 'school' : ['DPS','DPS', np.nan]} 
dict

{'name': ['ishika', 'pooja', 'raj'],
 'age': [22, 23, nan],
 'school': ['DPS', 'DPS', nan]}

In [105]:
df=pd.DataFrame(dict)
print(df)

     name   age school
0  ishika  22.0    DPS
1   pooja  23.0    DPS
2     raj   NaN    NaN


In [107]:
#create one more column

df['salary']='22,33,34'.split(',')
df

Unnamed: 0,name,age,school,salary
0,ishika,22.0,DPS,22
1,pooja,23.0,DPS,33
2,raj,,,34


In [57]:
#how to drop nan values from pandas
#either drop(Delete) or fill the value
#dropna= drop null values

print("\n Dropping any column with nan value\n",'-'*35,sep='')
print(df.dropna(axis=1))


 Dropping any column with nan value
-----------------------------------
     name salary
0  ishika     22
1   pooja     33
2     raj     34


In [59]:
#only column agw and school drop bcz they hv null values

In [61]:
print("\n Dropping any row with nan value\n",'-'*35,sep='')
print(df.dropna(axis=0))


 Dropping any row with nan value
-----------------------------------
     name   age school salary
0  ishika  22.0    DPS     22
1   pooja  23.0    DPS     33


In [63]:
print(df) #there is no chg in og data table bcz we did not save the data

     name   age school salary
0  ishika  22.0    DPS     22
1   pooja  23.0    DPS     33
2     raj   NaN    NaN     34


In [65]:
#fillna= fills all null values with any value you pass in brackets

print("\n Filling any column with nan value\n",'-'*35,sep='')
print(df.fillna(value=10))


 Filling any column with nan value
-----------------------------------
     name   age school salary
0  ishika  22.0    DPS     22
1   pooja  23.0    DPS     33
2     raj  10.0     10     34


In [67]:
print(df) #no permenant chg

     name   age school salary
0  ishika  22.0    DPS     22
1   pooja  23.0    DPS     33
2     raj   NaN    NaN     34


In [69]:
df

Unnamed: 0,name,age,school,salary
0,ishika,22.0,DPS,22
1,pooja,23.0,DPS,33
2,raj,,,34


In [109]:
#another way to fill null values using mean
print("\n Filling Values with a computed values (mean of a column A here)\n",'-'*60,sep='')

#calculate the mean of 4th column salary and fill all the null values with the mean
#inplace=True means we save our data for future(PERMANENT CHANGE)
df.fillna(value=df['age'].mean(),inplace=True) 
print(df)


 Filling Values with a computed values (mean of a column A here)
------------------------------------------------------------
     name   age school salary
0  ishika  22.0    DPS     22
1   pooja  23.0    DPS     33
2     raj  22.5   22.5     34


In [89]:
#salary does not have any null values, the fillna() method might not make sense here. 
#However, the primary issue is probably because fillna() is trying to replace null values 
#in the entire DataFrame, not just in the salary column.

print("\n Filling Values with a computed values (mean of a column A here)\n",'-'*60,sep='')

#calculate the mean of 4th column salary and fill all the null values with the mean
#inplace=True means we save our data for future(PERMANENT CHANGE)
df.fillna(value=df['salary'].mean(),inplace=True) 
print(df)


 Filling Values with a computed values (mean of a column A here)
------------------------------------------------------------


KeyError: 'salary'

In [73]:
#group by method =group same data together
#create data frame

In [111]:
data={'office':['Goog','Goog','MSFT','MSFT','FB','FB'],
      'name':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
     'Sales':[200,120,340,124,243,350]}

In [113]:
df=pd.DataFrame(data)
print(df)

  office     name  Sales
0   Goog      Sam    200
1   Goog  Charlie    120
2   MSFT      Amy    340
3   MSFT  Vanessa    124
4     FB     Carl    243
5     FB    Sarah    350


In [85]:
#Lets find the mean company wise. firstly we have to group by column names using groupby

In [87]:
#cannot calculate mean of name column bcz it is string data 
#but can calculate mean of sales column bcz int
#therefore error

bycomp=df.groupby('office')
print("\n Grouping by 'company' column and listing mean sales\n",'-'*55,sep='')
print(bycomp.mean())


 Grouping by 'company' column and listing mean sales
-------------------------------------------------------


TypeError: agg function failed [how->mean,dtype->object]

In [119]:
#use shift+tab within the bracket of bycomp.mean()
#it shows arguments that can be passed 
#we use numeric only to calculate mean of only numerical data

bycomp=df.groupby('office')
print("\n Grouping by 'company' column and listing mean sales\n",'-'*55,sep='')
print(bycomp.mean(numeric_only=True))


 Grouping by 'company' column and listing mean sales
-------------------------------------------------------
        Sales
office       
FB      296.5
Goog    160.0
MSFT    232.0


In [115]:
bycomp=df.groupby('office')
print("\n Grouping by 'company' column and listing sum of sales\n",'-'*55,sep='')
print(bycomp.sum())


 Grouping by 'company' column and listing sum of sales
-------------------------------------------------------
              name  Sales
office                   
FB       CarlSarah    593
Goog    SamCharlie    320
MSFT    AmyVanessa    464


In [121]:
#Concating DataFrame 
#There are Three types of concating  -1 = Concat
#                                     2  = Merge

In [153]:
class_a=pd.DataFrame({'Mat':[45,68,35,65],
                     'Eng':[54,87,36,52],
                     'Sci':[66,77,88,70],
                     'Chem':[54,79,58,76]},
                    index=['Raj','Shila','Sakshi','Abhishek'])
class_a

Unnamed: 0,Mat,Eng,Sci,Chem
Raj,45,54,66,54
Shila,68,87,77,79
Sakshi,35,36,88,58
Abhishek,65,52,70,76


In [155]:
class_b=pd.DataFrame({'Mat':[50,87,69,71],
                     'Eng':[66,50,87,69],
                     'Sci':[34,66,69,71],
                     'Chem':[54,87,34,66]},
                    index=['Shaik','Mahir','Vikas','Mina'])
class_b

Unnamed: 0,Mat,Eng,Sci,Chem
Shaik,50,66,34,54
Mahir,87,50,66,87
Vikas,69,87,69,34
Mina,71,69,71,66


In [157]:
class_c=pd.DataFrame({'Mat':[66,50,87,56],
                     'Eng':[56,87,89,71],
                     'Sci':[77,88,70,52],
                     'Chem':[34,56,56,87]},
                    index=['Aish','Ronak','Ram','Sher'])
class_c

Unnamed: 0,Mat,Eng,Sci,Chem
Aish,66,56,77,34
Ronak,50,87,88,56
Ram,87,89,70,56
Sher,56,71,52,87


In [159]:
#axis=0 ---> concatenate along rows
#create variable with name ---> all_student_marks1 (its stores dataframes)

all_student_marks1=pd.concat([class_a,class_b,class_c],axis=0)
print("\n After concatenation along row \n",'-'*30,sep='')
print(all_student_marks1)


 After concatenation along row 
------------------------------
          Mat  Eng  Sci  Chem
Raj        45   54   66    54
Shila      68   87   77    79
Sakshi     35   36   88    58
Abhishek   65   52   70    76
Shaik      50   66   34    54
Mahir      87   50   66    87
Vikas      69   87   69    34
Mina       71   69   71    66
Aish       66   56   77    34
Ronak      50   87   88    56
Ram        87   89   70    56
Sher       56   71   52    87


In [171]:
#axis=1 ---> concatenate along columns
#The all_student_marks2 variable is independent of any other DataFrame (df).

all_student_marks2=pd.concat([class_a,class_b,class_c],axis=1)
print("\n After concatenation along column \n",'-'*30,sep='')
print(all_student_marks2)


 After concatenation along column 
------------------------------
           Mat   Eng   Sci  Chem   Mat   Eng   Sci  Chem   Mat   Eng   Sci  \
Raj       45.0  54.0  66.0  54.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Shila     68.0  87.0  77.0  79.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Sakshi    35.0  36.0  88.0  58.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Abhishek  65.0  52.0  70.0  76.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Shaik      NaN   NaN   NaN   NaN  50.0  66.0  34.0  54.0   NaN   NaN   NaN   
Mahir      NaN   NaN   NaN   NaN  87.0  50.0  66.0  87.0   NaN   NaN   NaN   
Vikas      NaN   NaN   NaN   NaN  69.0  87.0  69.0  34.0   NaN   NaN   NaN   
Mina       NaN   NaN   NaN   NaN  71.0  69.0  71.0  66.0   NaN   NaN   NaN   
Aish       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  66.0  56.0  77.0   
Ronak      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  50.0  87.0  88.0   
Ram        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  87.0  89.0  70.0 

In [175]:
#However, the result is being assigned as an attribute to an existing DataFrame df, 
#creating all_student_marks2 as a custom attribute of df. (not a variable or dataframe but rather an attribute of df)

df.all_student_marks2=pd.concat([class_a,class_b,class_c],axis=1)
print("\n After concatenation along column \n",'-'*30,sep='')
print(all_student_marks2)


 After concatenation along column 
------------------------------
           Mat   Eng   Sci  Chem   Mat   Eng   Sci  Chem   Mat   Eng   Sci  \
Raj       45.0  54.0  66.0  54.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Shila     68.0  87.0  77.0  79.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Sakshi    35.0  36.0  88.0  58.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Abhishek  65.0  52.0  70.0  76.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
Shaik      NaN   NaN   NaN   NaN  50.0  66.0  34.0  54.0   NaN   NaN   NaN   
Mahir      NaN   NaN   NaN   NaN  87.0  50.0  66.0  87.0   NaN   NaN   NaN   
Vikas      NaN   NaN   NaN   NaN  69.0  87.0  69.0  34.0   NaN   NaN   NaN   
Mina       NaN   NaN   NaN   NaN  71.0  69.0  71.0  66.0   NaN   NaN   NaN   
Aish       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  66.0  56.0  77.0   
Ronak      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  50.0  87.0  88.0   
Ram        NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  87.0  89.0  70.0 

In [181]:
#lets fill all nan values
df.all_student_marks2.fillna(value=0,inplace=True)
print(df.all_student_marks2)

           Mat   Eng   Sci  Chem   Mat   Eng   Sci  Chem   Mat   Eng   Sci  \
Raj       45.0  54.0  66.0  54.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
Shila     68.0  87.0  77.0  79.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
Sakshi    35.0  36.0  88.0  58.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
Abhishek  65.0  52.0  70.0  76.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
Shaik      0.0   0.0   0.0   0.0  50.0  66.0  34.0  54.0   0.0   0.0   0.0   
Mahir      0.0   0.0   0.0   0.0  87.0  50.0  66.0  87.0   0.0   0.0   0.0   
Vikas      0.0   0.0   0.0   0.0  69.0  87.0  69.0  34.0   0.0   0.0   0.0   
Mina       0.0   0.0   0.0   0.0  71.0  69.0  71.0  66.0   0.0   0.0   0.0   
Aish       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  66.0  56.0  77.0   
Ronak      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  50.0  87.0  88.0   
Ram        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  87.0  89.0  70.0   
Sher       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  56.0  

In [183]:
#Merge (use pd.merge)

In [185]:
user_id_name=pd.DataFrame({'ID':[25,30,35,40,45,50,55],
                            'Name':['Rajshree','Mistri','Patel','Mardana','Harish','Potla','Chhetri']})

In [187]:
user_id_name

Unnamed: 0,ID,Name
0,25,Rajshree
1,30,Mistri
2,35,Patel
3,40,Mardana
4,45,Harish
5,50,Potla
6,55,Chhetri


In [189]:
player_detail=pd.DataFrame({'ID':[25,30,45,50,55],
                            'Age':[36,34,20,28,37],
                            'Country':['delhi','noida','mumbai','Faridabad','gujrat']})

In [191]:
player_detail

Unnamed: 0,ID,Age,Country
0,25,36,delhi
1,30,34,noida
2,45,20,mumbai
3,50,28,Faridabad
4,55,37,gujrat


In [193]:
#player detail doesnt hv id=35,45 which are not included in merge list

pd.merge(user_id_name,player_detail)

Unnamed: 0,ID,Name,Age,Country
0,25,Rajshree,36,delhi
1,30,Mistri,34,noida
2,45,Harish,20,mumbai
3,50,Potla,28,Faridabad
4,55,Chhetri,37,gujrat


In [195]:
pd.merge(user_id_name,player_detail, on='ID')

Unnamed: 0,ID,Name,Age,Country
0,25,Rajshree,36,delhi
1,30,Mistri,34,noida
2,45,Harish,20,mumbai
3,50,Potla,28,Faridabad
4,55,Chhetri,37,gujrat


In [197]:
#join

In [209]:
girls=pd.DataFrame({'name1':['pooja','rani'],
                    'age1' : [23,21]},
                    index = [1,2])
girls

Unnamed: 0,name1,age1
1,pooja,23
2,rani,21


In [211]:
boys=pd.DataFrame({'name2':['ram','ravan','lakshman'],
                    'age2' : [33,11,26]},
                    index = [1,2,3])
boys

Unnamed: 0,name2,age2
1,ram,33
2,ravan,11
3,lakshman,26


In [213]:
#column names for both data frames shld be different or it gives error
#only joins common indexes of both datafranes

girls.join(boys)

Unnamed: 0,name1,age1,name2,age2
1,pooja,23,ram,33
2,rani,21,ravan,11


In [217]:
#used to join all(common and uncommon) indexes of both dataframes

girls.join(boys, how='outer')

Unnamed: 0,name1,age1,name2,age2
1,pooja,23.0,ram,33
2,rani,21.0,ravan,11
3,,,lakshman,26
