## Working with the pandas & NumPy libraries

Pandas is a popular Python library for data manipulation and analysis. It provides data structures like DataFrames for organizing tabular data, and Series for one-dimensional data. Pandas simplifies data handling tasks by offering powerful tools for data cleaning, exploration, and transformation, making it a go-to choice for data scientists and analysts.

NumPy is a fundamental Python library for numerical operations. It introduces arrays as its core data structure, enabling efficient mathematical and statistical computations. NumPy is essential for tasks like linear algebra, array manipulation, and scientific computing, serving as the foundation for various data science and machine learning libraries.

### However, I prefer the H2O data frame, as I use H2O for model development (e.g. H2O GBM, H2O Random Forest, etc.).
### With the H2O data frame I am able to build good models and later iterate on them to improve performance.

In [1]:
import pandas as pd

In [2]:
marks = [80,90,100,40]
labels = ["maths","science","physics","chem"]

In [3]:
series = pd.Series(marks,labels)

In [4]:
series

maths       80
science     90
physics    100
chem        40
dtype: int64

In [5]:
series = pd.Series(data = marks,index = labels)

In [6]:
series

maths       80
science     90
physics    100
chem        40
dtype: int64

In [7]:
type(series)

pandas.core.series.Series

In [8]:
series['maths']

80

In [9]:
import numpy as np

# 4x4 grid holding the integers 0..15, plus the row labels (lower case)
# and column labels (upper case) used to build a DataFrame from it.
p = np.arange(16).reshape((4, 4))
labels = "a b c d".split()
label2 = "A B C D".split()

In [10]:
p

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [11]:
df = pd.DataFrame(p,index = labels,columns = label2)


In [12]:
df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [13]:
# 4x4 frame with upper-case row labels and lower-case column labels.
df1 = pd.DataFrame(
    data=[
        [1, 2, 3, 4],
        [4, 5, 6, 7],
        [8, 9, 10, 11],
        [12, 13, 14, 15],
    ],
    index=['A', 'B', 'C', 'D'],
    columns=['a', 'b', 'c', 'd'],
)

In [14]:
"A B C D".split()

['A', 'B', 'C', 'D']

In [15]:
"a b c d".split()

['a', 'b', 'c', 'd']

In [16]:
df1

Unnamed: 0,a,b,c,d
A,1,2,3,4
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [17]:
df1[['b','d']]

Unnamed: 0,b,d
A,2,4
B,5,7
C,9,11
D,13,15


In [18]:
df1['a']

A     1
B     4
C     8
D    12
Name: a, dtype: int64

In [19]:
df1.loc['C']

a     8
b     9
c    10
d    11
Name: C, dtype: int64

In [20]:
df1.iloc[0]

a    1
b    2
c    3
d    4
Name: A, dtype: int64

In [21]:
df1['new'] =df1['a']+df1['b']

In [22]:
df1

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [23]:
df1['new2'] = [1,2,4,6]

In [24]:
df1

Unnamed: 0,a,b,c,d,new,new2
A,1,2,3,4,3,1
B,4,5,6,7,9,2
C,8,9,10,11,17,4
D,12,13,14,15,25,6


In [25]:
df1.drop('new2',axis =1)  # axis=1 drops a column; this returns a new DataFrame and leaves df1 itself unchanged

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [26]:
df1

Unnamed: 0,a,b,c,d,new,new2
A,1,2,3,4,3,1
B,4,5,6,7,9,2
C,8,9,10,11,17,4
D,12,13,14,15,25,6


In [27]:
df1.drop('new2',axis = 1,inplace =  True)  # inplace=True removes the column from df1 itself and returns nothing

In [28]:
df1

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [29]:
df1

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [30]:
df1

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [31]:
df1.reset_index()

Unnamed: 0,index,a,b,c,d,new
0,A,1,2,3,4,3
1,B,4,5,6,7,9
2,C,8,9,10,11,17
3,D,12,13,14,15,25


In [32]:
newind = "CA NY  WY DR".split()

In [33]:
newind

['CA', 'NY', 'WY', 'DR']

In [34]:
df1

Unnamed: 0,a,b,c,d,new
A,1,2,3,4,3
B,4,5,6,7,9
C,8,9,10,11,17
D,12,13,14,15,25


In [35]:
df1["newindex"] = newind

In [36]:
df1

Unnamed: 0,a,b,c,d,new,newindex
A,1,2,3,4,3,CA
B,4,5,6,7,9,NY
C,8,9,10,11,17,WY
D,12,13,14,15,25,DR


In [37]:
df1.set_index("newindex",inplace  = True)

In [38]:
df1

Unnamed: 0_level_0,a,b,c,d,new
newindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,1,2,3,4,3
NY,4,5,6,7,9
WY,8,9,10,11,17
DR,12,13,14,15,25


In [39]:
df1.head(2) # returns the first 2 rows of the dataset (head() with no argument returns the first 5)

Unnamed: 0_level_0,a,b,c,d,new
newindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,1,2,3,4,3
NY,4,5,6,7,9


In [40]:
df1.tail(2)

Unnamed: 0_level_0,a,b,c,d,new
newindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WY,8,9,10,11,17
DR,12,13,14,15,25


In [41]:
df1['a'].unique()

array([ 1,  4,  8, 12])

In [42]:
df1['a'].value_counts()

a
1     1
4     1
8     1
12    1
Name: count, dtype: int64

In [43]:
df1.describe()

Unnamed: 0,a,b,c,d,new
count,4.0,4.0,4.0,4.0,4.0
mean,6.25,7.25,8.25,9.25,13.5
std,4.787136,4.787136,4.787136,4.787136,9.574271
min,1.0,2.0,3.0,4.0,3.0
25%,3.25,4.25,5.25,6.25,7.5
50%,6.0,7.0,8.0,9.0,13.0
75%,9.0,10.0,11.0,12.0,19.0
max,12.0,13.0,14.0,15.0,25.0


In [44]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to DR
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       4 non-null      int64
 1   b       4 non-null      int64
 2   c       4 non-null      int64
 3   d       4 non-null      int64
 4   new     4 non-null      int64
dtypes: int64(5)
memory usage: 192.0+ bytes


In [45]:
# Small frame with missing values for the null-handling examples:
# 'A' has one missing value, 'B' has two, and 'C' has none.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({'A': [1, 2, np.nan],
                   'B': [5, np.nan, np.nan],
                   'C': [1, 2, 3]})

In [46]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [47]:
df.isnull().any  # no call parentheses: this displays the bound method instead of running it; use .any() to get the per-column result

<bound method NDFrame._add_numeric_operations.<locals>.any of        A      B      C
0  False  False  False
1  False   True  False
2   True   True  False>

In [48]:
df.isnull().any()

A     True
B     True
C    False
dtype: bool

In [49]:
df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [50]:
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [51]:
df.dropna(axis = 1)

Unnamed: 0,C
0,1
1,2
2,3


In [52]:
df.dropna(thresh = 2,axis=1)  # axis=1 works on columns; thresh=2 keeps only the columns with at least 2 non-null values

Unnamed: 0,A,C
0,1.0,1
1,2.0,2
2,,3


In [53]:
df.fillna(value= 60)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,60.0,2
2,60.0,60.0,3


In [54]:
# Replace the missing values in 'A' with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['A'] selection is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update df under Copy-on-Write.
df['A'] = df['A'].fillna(df['A'].mean())

In [55]:
# Replace the missing values in 'B' with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['B'] selection is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update df under Copy-on-Write.
df['B'] = df['B'].fillna(df['B'].mean())

In [56]:
# Replace the missing values in 'B' with the column median (this has no
# effect once 'B' no longer contains missing values).
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the df['B'] selection, which is chained assignment
# and does not update df under Copy-on-Write.
df['B'] = df['B'].fillna(df['B'].median())

In [57]:
p = df['A'].mode()

In [58]:
p

0    1.0
1    1.5
2    2.0
Name: A, dtype: float64

In [59]:
p[1]

1.5

In [60]:
df['A'].fillna(df["A"].mode()[0])

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [61]:
# Single textual column with one missing entry and two values tied for
# most frequent, used for the mode-based fill examples.
df1 = pd.DataFrame({
    'country': [
        "Aindia",
        np.nan,
        "Aindia",
        "Australia",
        "Australia",
        "us",
        "canada",
    ],
})

In [62]:
df1

Unnamed: 0,country
0,Aindia
1,
2,Aindia
3,Australia
4,Australia
5,us
6,canada


In [63]:
q = df1['country'].mode()

In [64]:
q[0]

'Aindia'

In [65]:
df1['country'].fillna(df1['country'].mode()[1])  # two values tie for most frequent, so mode() has several entries; [1] picks the second one ('Australia'); df1 itself is not modified

0       Aindia
1    Australia
2       Aindia
3    Australia
4    Australia
5           us
6       canada
Name: country, dtype: object

In [66]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


mean - ideal when the column's values fall within a limited range, e.g. age (20 to 80)
median - when the values vary a lot, e.g. salary (1000 to 1000000)
mode - for textual data

Create a DataFrame with 10 rows and seven columns; two of the columns should be textual.
Put some missing values in four of the columns, one of them a textual column.
Apply all the functions that were taught.

In [81]:
# 4x4 frame of the integers 0..15 with columns p, q, r, s and the default
# integer row index.
columns = ["p", "q", "r", "s"]
data = np.arange(16).reshape((4, 4))
df2 = pd.DataFrame(data=data, columns=columns)

In [82]:
df2

Unnamed: 0,p,q,r,s
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [83]:
"a b c d".split()

['a', 'b', 'c', 'd']

In [85]:
import pandas as pd
import numpy as np

# Same 4x4 grid of 0..15, this time with explicit row labels (a-d) and
# column labels (p-s).
index_list = "a b c d".split()
columns_list = "p q r s".split()
data = np.arange(16).reshape((4, 4))

df2 = pd.DataFrame(data=data, columns=columns_list, index=index_list)


In [86]:
newindex = ["row1","row2","row3","row4"]

In [87]:
df2["newcolumn"] = newindex

In [88]:
df2

Unnamed: 0,p,q,r,s,newcolumn
a,0,1,2,3,row1
b,4,5,6,7,row2
c,8,9,10,11,row3
d,12,13,14,15,row4


In [89]:
df2.set_index("newcolumn",inplace = True)

In [90]:
df2 

Unnamed: 0_level_0,p,q,r,s
newcolumn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [69]:
# 10-row, 7-column frame of single-character strings in which every column
# contains exactly one missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame({
    '1': ['h', 'e', 'l', 'l', 'o', np.nan, 'n', 'a', 'i', 't'],
    '2': ['h', 'e', 'l', 'l', 'o', 'n', 'a', 'i', np.nan, 't'],
    '3': ['h', 'e', 'l', 'l', 'o', np.nan, 'n', 'a', 'i', 't'],
    '4': ['h', 'e', 'l', 'l', 'o', np.nan, 'n', 'a', 'i', 't'],
    '5': ['h', 'e', np.nan, 'l', 'o', 'n', 'a', 'i', 't', 'k'],
    '6': ['h', np.nan, 'l', 'l', 'o', 'k', 'n', 'a', 'i', 't'],
    '7': ['h', 'e', 'l', 'l', 'o', np.nan, 'n', 'a', 'i', 't'],
})
data



Unnamed: 0,1,2,3,4,5,6,7
0,h,h,h,h,h,h,h
1,e,e,e,e,e,,e
2,l,l,l,l,,l,l
3,l,l,l,l,l,l,l
4,o,o,o,o,o,o,o
5,,n,,,n,k,
6,n,a,n,n,a,n,n
7,a,i,a,a,i,a,a
8,i,,i,i,t,i,i
9,t,t,t,t,k,t,t


In [70]:
data['1'].mode()

0    l
Name: 1, dtype: object

In [71]:
# Fill the missing entry in columns '1', '2' and '3' with that column's most
# frequent value (the first entry of mode()).
# Each result is assigned back to its column: fillna(inplace=True) on a
# data[...] selection is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update the frame under Copy-on-Write.
data['1'] = data['1'].fillna(data['1'].mode()[0])
data['2'] = data['2'].fillna(data['2'].mode()[0])
data['3'] = data['3'].fillna(data['3'].mode()[0])

In [72]:
data

Unnamed: 0,1,2,3,4,5,6,7
0,h,h,h,h,h,h,h
1,e,e,e,e,e,,e
2,l,l,l,l,,l,l
3,l,l,l,l,l,l,l
4,o,o,o,o,o,o,o
5,l,n,l,,n,k,
6,n,a,n,n,a,n,n
7,a,i,a,a,i,a,a
8,i,l,i,i,t,i,i
9,t,t,t,t,k,t,t


In [73]:
d3 = df  # binds a second name to the same DataFrame object; no copy is made

In [74]:
d3

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


In [75]:
# Fresh frame with missing values: 'A' has one missing value, 'B' has two,
# and 'C' has none.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
d3 = pd.DataFrame({'A': [1, 2, np.nan],
                   'B': [5, np.nan, np.nan],
                   'C': [1, 2, 3]})

In [76]:
d3

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [77]:
# Replace the missing values in 'A' with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the d3['A'] selection is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update d3 under Copy-on-Write.
d3['A'] = d3['A'].fillna(d3['A'].mean())

In [78]:
# Replace the missing values in 'B' with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the d3['B'] selection is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update d3 under Copy-on-Write.
d3['B'] = d3['B'].fillna(d3['B'].median())

In [79]:
d3

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


## Working with NumPy

In [91]:
import numpy as np

In [92]:
list =[1,2,3]  # NOTE: binding the name 'list' shadows the built-in list type for the rest of the session

In [93]:
type(list)

list

In [95]:
arr =np.array(list)

In [96]:
arr

array([1, 2, 3])

In [97]:
type(arr)

numpy.ndarray

In [100]:
print(arr.shape)
print(arr.ndim)

(3,)
1


In [101]:
arr2 = np.array([3,4,5,6,7,8])

In [102]:
arr2

array([3, 4, 5, 6, 7, 8])

In [103]:
arr2[3:4]

array([6])