# Pandas

In [171]:
import pandas as pd
import numpy as np

## Generating data using pandas

In [172]:
# Seed NumPy's global RNG so the random frame is identical on every
# Restart & Run All (later np.random calls in the same run become repeatable too).
np.random.seed(42)

# 5 rows x 4 columns of standard-normal samples with explicit row and column labels
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,-1.758982,0.542945,1.649626,-1.691031
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


## Extract the values of column 'w'

In [173]:
# Selecting one column label with [] returns that column as a Series, indexed by the row labels
df['w']

a   -1.758982
b   -1.422586
c   -1.089396
d   -0.961794
e    1.250523
Name: w, dtype: float64

In [174]:
# Confirm the type of a single selected column
type(df['w'])

pandas.core.series.Series

In [175]:
# Pull the column's data out of pandas as a plain NumPy array
X = df['w'].to_numpy()

In [176]:
# print() shows only the raw values: the row labels and the column name are not part of the array
print(X)

[-1.75898175 -1.42258629 -1.08939639 -0.96179385  1.25052349]


In [177]:
# Confirm that X is a NumPy array rather than a pandas object
type(X)

numpy.ndarray

## Extracting two columns

In [178]:
# Passing a list of labels (note the double brackets) selects several columns and returns a DataFrame
df[['w','x']]

Unnamed: 0,w,x
a,-1.758982,0.542945
b,-1.422586,0.957756
c,-1.089396,1.092357
d,-0.961794,0.739714
e,1.250523,-0.171233


## Find the names of the columns

In [179]:
# The column labels are held in an Index object
df.columns

Index(['w', 'x', 'y', 'z'], dtype='object')

In [180]:
# Show the current frame
df

Unnamed: 0,w,x,y,z
a,-1.758982,0.542945,1.649626,-1.691031
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


## Adding a new column

In [181]:
# Assigning a list with one value per row to a new label creates that column
df['Z'] = list(range(1, 6))

In [182]:
# The new column 'Z' now appears as the last column
df

Unnamed: 0,w,x,y,z,Z
a,-1.758982,0.542945,1.649626,-1.691031,1
b,-1.422586,0.957756,0.967391,0.893685,2
c,-1.089396,1.092357,0.285002,-1.019985,3
d,-0.961794,0.739714,-1.051779,-0.157403,4
e,1.250523,-0.171233,0.011222,1.48632,5


## Dropping a column

In [183]:
# drop() returns a new frame without column 'Z'; df itself is left as it was
df.drop(columns=['Z'])

Unnamed: 0,w,x,y,z
a,-1.758982,0.542945,1.649626,-1.691031
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


In [184]:
# df still contains 'Z': the drop above returned a new frame and did not modify df
df

Unnamed: 0,w,x,y,z,Z
a,-1.758982,0.542945,1.649626,-1.691031,1
b,-1.422586,0.957756,0.967391,0.893685,2
c,-1.089396,1.092357,0.285002,-1.019985,3
d,-0.961794,0.739714,-1.051779,-0.157403,4
e,1.250523,-0.171233,0.011222,1.48632,5


## Permanently drop the column

In [185]:
# Rebind df to the frame without 'Z' so the removal sticks.
# errors='ignore' lets this cell be re-run after 'Z' is already gone instead of raising KeyError.
df = df.drop(columns=['Z'], errors='ignore')

In [186]:
# 'Z' is no longer a column of df
df

Unnamed: 0,w,x,y,z
a,-1.758982,0.542945,1.649626,-1.691031
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


## Dropping a row

In [187]:
# Rows are dropped by index label; the result is a new frame and df keeps row 'a'
df.drop(index=['a'])

Unnamed: 0,w,x,y,z
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


In [188]:
# Row 'a' is still present: the drop above returned a new frame and did not modify df
df

Unnamed: 0,w,x,y,z
a,-1.758982,0.542945,1.649626,-1.691031
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


In [189]:
# Rebind df to the frame without row 'a' so the removal sticks.
# errors='ignore' lets this cell be re-run after 'a' is already gone instead of raising KeyError.
df = df.drop(index=['a'], errors='ignore')


In [190]:
# Row 'a' has now been removed from df itself
df

Unnamed: 0,w,x,y,z
b,-1.422586,0.957756,0.967391,0.893685
c,-1.089396,1.092357,0.285002,-1.019985
d,-0.961794,0.739714,-1.051779,-0.157403
e,1.250523,-0.171233,0.011222,1.48632


In [191]:
# Start over with a fresh 5x4 frame of standard-normal samples
row_labels = list('abcde')
col_labels = list('wxyz')
df = pd.DataFrame(
    np.random.randn(len(row_labels), len(col_labels)),
    index=row_labels,
    columns=col_labels,
)
df

Unnamed: 0,w,x,y,z
a,-0.231746,0.497923,0.536086,-1.74487
b,1.700029,-0.584681,0.174853,-0.54177
c,0.718119,-0.248444,-1.22864,-0.488446
d,-0.800841,-1.434054,0.284097,1.222273
e,1.208924,1.022983,-2.730695,-0.296092


## Use `.loc` to look up values by row label, column label, or both

In [192]:
# .loc with a single row label returns that row as a Series indexed by the column names
df.loc['a']

w   -0.231746
x    0.497923
y    0.536086
z   -1.744870
Name: a, dtype: float64

In [193]:
# .loc[row_label, column_label] returns the single value stored in that cell
df.loc['a','w']

-0.23174561951588243

## Use integer-location indexing, i.e. `.iloc`

In [194]:
# .iloc uses zero-based integer positions: row 0, column 0 is the same cell as .loc['a', 'w']
df.iloc[0,0]

-0.23174561951588243

In [195]:
# Overwrite the cell at row position 2, column position 3 (row 'c', column 'z') with 0
df.iloc[2,3] = 0
df

Unnamed: 0,w,x,y,z
a,-0.231746,0.497923,0.536086,-1.74487
b,1.700029,-0.584681,0.174853,-0.54177
c,0.718119,-0.248444,-1.22864,0.0
d,-0.800841,-1.434054,0.284097,1.222273
e,1.208924,1.022983,-2.730695,-0.296092


In [196]:
# The assignment above changed df itself, so the 0 is still there when df is shown again
df

Unnamed: 0,w,x,y,z
a,-0.231746,0.497923,0.536086,-1.74487
b,1.700029,-0.584681,0.174853,-0.54177
c,0.718119,-0.248444,-1.22864,0.0
d,-0.800841,-1.434054,0.284097,1.222273
e,1.208924,1.022983,-2.730695,-0.296092


In [197]:
# Position slices exclude the end point: rows 1-2 ('b', 'c') and columns 1-2 ('x', 'y')
df.iloc[1:3,1:3]


Unnamed: 0,x,y
b,-0.584681,0.174853
c,-0.248444,-1.22864


## Inserting a new row

In [198]:
# Assigning through .loc to a label that is not in the index yet appends it as a new row
new_row = [1, 2, 3, 4]
df.loc['f'] = new_row

In [199]:
# Row 'f' is added at the bottom; its integers are shown as floats (1.0, 2.0, ...) like the rest of each column
df

Unnamed: 0,w,x,y,z
a,-0.231746,0.497923,0.536086,-1.74487
b,1.700029,-0.584681,0.174853,-0.54177
c,0.718119,-0.248444,-1.22864,0.0
d,-0.800841,-1.434054,0.284097,1.222273
e,1.208924,1.022983,-2.730695,-0.296092
f,1.0,2.0,3.0,4.0


## Permanently drop row f

In [200]:
# Rebind df to the frame without row 'f' so the removal sticks.
# errors='ignore' lets this cell be re-run after 'f' is already gone instead of raising KeyError.
df = df.drop(index=['f'], errors='ignore')

In [201]:
# Back to the five rows 'a' through 'e'
df

Unnamed: 0,w,x,y,z
a,-0.231746,0.497923,0.536086,-1.74487
b,1.700029,-0.584681,0.174853,-0.54177
c,0.718119,-0.248444,-1.22864,0.0
d,-0.800841,-1.434054,0.284097,1.222273
e,1.208924,1.022983,-2.730695,-0.296092


# Generating another dataset using pandas

In [202]:
# Small sales table with deliberately messy values: np.nan marks missing entries,
# and '?' is a placeholder string mixed into the otherwise numeric lossValues column.
sales_records = {
    "name": ["William", "Emma", "Sofia", "Markus", "Edward", "Thomas",
             "Ethan", "Olivia", "Arun", "Anika", "Paulo"],
    "region": ["East", np.nan, "East", "South", "West", "West",
               "South", "West", "West", "East", "South"],
    "sales": [50000, 52000, 90000, np.nan, 42000, 72000,
              49000, np.nan, 67000, 65000, 67000],
    "expenses": [42000, 43000, np.nan, 44000, 38000, 39000,
                 42000, np.nan, 39000, 44000, 45000],
    "lossValues": [12000, '?', '?', np.nan, 11000, 33000,
                   44000, 10000, 22000, 30000, 99000],
}
df = pd.DataFrame(sales_records)

df

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000
1,Emma,,52000.0,43000.0,?
2,Sofia,East,90000.0,,?
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000
5,Thomas,West,72000.0,39000.0,33000
6,Ethan,South,49000.0,42000.0,44000
7,Olivia,West,,,10000
8,Arun,West,67000.0,39000.0,22000
9,Anika,East,65000.0,44000.0,30000


In [203]:
# 'expenses' is a float64 column; its missing entries appear as NaN
df['expenses']

0     42000.0
1     43000.0
2         NaN
3     44000.0
4     38000.0
5     39000.0
6     42000.0
7         NaN
8     39000.0
9     44000.0
10    45000.0
Name: expenses, dtype: float64

In [204]:
# 'lossValues' mixes numbers, '?' strings and NaN, so the column has dtype object
df['lossValues']

0     12000
1         ?
2         ?
3       NaN
4     11000
5     33000
6     44000
7     10000
8     22000
9     30000
10    99000
Name: lossValues, dtype: object

## Information about data

In [205]:
# Summary of the frame: index range, per-column non-null counts and dtypes, memory usage
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        11 non-null     object 
 1   region      10 non-null     object 
 2   sales       9 non-null      float64
 3   expenses    9 non-null      float64
 4   lossValues  10 non-null     object 
dtypes: float64(2), object(3)
memory usage: 568.0+ bytes


## Statistics of columns having float or integer values

In [206]:
# describe() summarises only the numeric columns (here sales and expenses); count excludes missing values
df.describe() 

Unnamed: 0,sales,expenses
count,9.0,9.0
mean,61555.555556,41777.777778
std,14808.030854,2538.591035
min,42000.0,38000.0
25%,50000.0,39000.0
50%,65000.0,42000.0
75%,67000.0,44000.0
max,90000.0,45000.0


## Transpose

In [207]:
# transpose() swaps rows and columns, giving one row per summarised column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sales,9.0,61555.555556,14808.030854,42000.0,50000.0,65000.0,67000.0,90000.0
expenses,9.0,41777.777778,2538.591035,38000.0,39000.0,42000.0,44000.0,45000.0


In [208]:
# .T is shorthand for transpose() and produces the same table
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sales,9.0,61555.555556,14808.030854,42000.0,50000.0,65000.0,67000.0,90000.0
expenses,9.0,41777.777778,2538.591035,38000.0,39000.0,42000.0,44000.0,45000.0


## Finding missing or null values

In [209]:
# Review the frame: it contains both NaN entries and '?' placeholder strings
df

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000
1,Emma,,52000.0,43000.0,?
2,Sofia,East,90000.0,,?
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000
5,Thomas,West,72000.0,39000.0,33000
6,Ethan,South,49000.0,42000.0,44000
7,Olivia,West,,,10000
8,Arun,West,67000.0,39000.0,22000
9,Anika,East,65000.0,44000.0,30000


In [210]:
# isnull() flags missing entries; any() then reports, per column, whether at least one entry is missing
df.isnull().any()

name          False
region         True
sales          True
expenses       True
lossValues     True
dtype: bool

In [211]:
# Summing the flags gives the number of missing entries per column ('?' strings are not counted as missing)
df.isnull().sum()

name          0
region        1
sales         2
expenses      2
lossValues    1
dtype: int64

## Replacing "?" with a null value

In [212]:
# Swap the '?' placeholders for NaN so pandas treats them as missing values.
df = df.replace(to_replace='?', value=np.nan)
# lossValues now holds only numbers and NaN, but it was built as an object column.
# Convert it explicitly instead of relying on replace() to downcast it: recent pandas
# versions deprecate that implicit downcast, and numeric summaries need a numeric dtype.
df['lossValues'] = pd.to_numeric(df['lossValues'])

In [213]:
# The '?' entries in lossValues are now shown as missing
df

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,,52000.0,43000.0,
2,Sofia,East,90000.0,,
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,,,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


## Creating a copy of df

In [214]:
# copy() gives df1 its own data, so changes made to df1 do not affect df
df1 = df.copy()
df1

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,,52000.0,43000.0,
2,Sofia,East,90000.0,,
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,,,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


## Fill NaN with a value = 1000

In [215]:
# fillna() returns a new frame with every NaN replaced by 1000 (the text column 'region' included);
# df1 itself is not modified because the result is not assigned back
df1.fillna(1000)

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,1000,52000.0,43000.0,1000.0
2,Sofia,East,90000.0,1000.0,1000.0
3,Markus,South,1000.0,44000.0,1000.0
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,1000.0,1000.0,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


In [216]:
# df1 still has its NaN values
df1

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,,52000.0,43000.0,
2,Sofia,East,90000.0,,
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,,,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


## Fill the NaN values of df1['region'] with the word 'NEWS'

In [217]:
# Fill only the 'region' column and assign the result back to it.
# Calling fillna(inplace=True) on df1['region'] acts on an intermediate Series, which is not
# guaranteed to update df1: recent pandas warns about it, and with Copy-on-Write it has no effect.
df1['region'] = df1['region'].fillna(value='NEWS')

In [218]:
# Only 'region' was filled; the numeric columns still contain NaN
df1

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,NEWS,52000.0,43000.0,
2,Sofia,East,90000.0,,
3,Markus,South,,44000.0,
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,,,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


## Fill the NaN values of column sales with the column mean, of column expenses with the column median,
## and of column lossValues with the column's standard deviation

In [219]:
# One replacement value per column, each computed from that column's current values
fill_values = {
    'sales': df1['sales'].mean(),
    'expenses': df1['expenses'].median(),
    'lossValues': df1['lossValues'].std(),
}
# A dict passed to fillna() applies each value only to the column named by its key
df1.fillna(value=fill_values, inplace=True)

In [220]:
# The previously missing entries in sales, expenses and lossValues now hold the fill values
df1

Unnamed: 0,name,region,sales,expenses,lossValues
0,William,East,50000.0,42000.0,12000.0
1,Emma,NEWS,52000.0,43000.0,29422.719598
2,Sofia,East,90000.0,42000.0,29422.719598
3,Markus,South,61555.555556,44000.0,29422.719598
4,Edward,West,42000.0,38000.0,11000.0
5,Thomas,West,72000.0,39000.0,33000.0
6,Ethan,South,49000.0,42000.0,44000.0
7,Olivia,West,61555.555556,42000.0,10000.0
8,Arun,West,67000.0,39000.0,22000.0
9,Anika,East,65000.0,44000.0,30000.0


## Finding Unique values in a column

In [221]:
# unique() returns the distinct values as a NumPy array, in order of first appearance
df1['sales'].unique()

array([50000.        , 52000.        , 90000.        , 61555.55555556,
       42000.        , 72000.        , 49000.        , 67000.        ,
       65000.        ])

## Finding the number of unique values in a column

In [222]:
# Number of distinct values in 'sales'; dropna=False counts NaN as a value, as len(unique()) would
df1['sales'].nunique(dropna=False)

9

## Frequency of a value in a column

In [223]:
# value_counts() lists each distinct value with the number of times it occurs, most frequent first
df1['sales'].value_counts()

61555.555556    2
67000.000000    2
50000.000000    1
52000.000000    1
90000.000000    1
42000.000000    1
72000.000000    1
49000.000000    1
65000.000000    1
Name: sales, dtype: int64

## Reading a csv file

In [224]:
# Mount Google Drive before reading so this cell also works on a fresh kernel.
# The file is read by absolute path, so it does not depend on the %cd further down.
# NOTE(review): assumes Data.csv sits in the course folder that the later %cd switches to; confirm.
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/Course Material/Introduction/Python crash course/Data.csv')

In [225]:
# Colab-only: make Google Drive available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [226]:
# Switch the working directory to the course folder on Drive (%cd is an IPython line magic, not plain Python)
%cd /content/drive/MyDrive/Course Material/Introduction/Python crash course

/content/drive/MyDrive/Course Material/Introduction/Python crash course


In [227]:
# Preview the first five rows of the loaded CSV
data.head()

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
0,0,1424696638740,27920678471000,-0.565032,-9.572019,-0.614113,a,gear,gear_1,stand
1,1,1424696638740,27920681910000,-0.832584,-9.713276,-0.60693,a,gear,gear_1,stand
2,2,1424696638740,27920692014000,-1.018134,-9.935339,-0.544082,a,gear,gear_1,stand
3,3,1424696638741,27920701983000,-1.222838,-10.142437,-0.566229,a,gear,gear_1,stand
4,4,1424696638741,27920711906000,-1.57718,-10.480618,-0.402824,a,gear,gear_1,stand
