## First steps on Pandas
How to install Pandas step by step - https://pandas.pydata.org/pandas-docs/stable/install.html





In [92]:
#Import pandas library to use it in the following steps
import pandas as pd
import numpy as np
# We create our data frame reading a file, comma is the default separator
dataFrame = pd.read_csv('forestfires.csv')

In [76]:
print("Description of an index : ")
dataFrame.index

Description of an index : 


RangeIndex(start=0, stop=517, step=1)

In [77]:
print("Description of a column : ")
dataFrame.columns

Description of a column : 


Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [78]:
print("Description of data types : ")
dataFrame.dtypes

Description of data types : 


X          int64
Y          int64
month     object
day       object
FFMC     float64
DMC      float64
DC       float64
ISI      float64
temp     float64
RH         int64
wind     float64
rain     float64
area     float64
dtype: object

In [79]:
print("A summary description of a data frame")
dataFrame.info()

A summary description of a data frame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
X        517 non-null int64
Y        517 non-null int64
month    517 non-null object
day      517 non-null object
FFMC     517 non-null float64
DMC      517 non-null float64
DC       517 non-null float64
ISI      517 non-null float64
temp     517 non-null float64
RH       517 non-null int64
wind     517 non-null float64
rain     517 non-null float64
area     517 non-null float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


.loc indexer used to select based on label (it is an attribute)

In [80]:
dataFrame.loc[2]

X            7
Y            4
month      oct
day        sat
FFMC      90.6
DMC       43.7
DC       686.9
ISI        6.7
temp      14.6
RH          33
wind       1.3
rain         0
area         0
Name: 2, dtype: object

In [81]:
dataFrame.loc[[3,56,58]]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
56,4,3,sep,tue,90.3,80.7,730.2,6.3,17.8,63,4.9,0.0,0.0
58,2,2,feb,mon,84.0,9.3,34.0,2.1,13.9,40,5.4,0.0,0.0


.iloc indexer only by integer location - as Python list (also an attribute)

In [82]:
dataFrame.iloc[[3,56]]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
56,4,3,sep,tue,90.3,80.7,730.2,6.3,17.8,63,4.9,0.0,0.0


In [83]:
print("Mean temperature of the dataset {}".format(dataFrame['temp'].mean()) )
print("Description of a new data frame from 'month' column: ")
monthDF = dataFrame['month']
monthDF.head()

Mean temperature of the dataset 18.88916827852998
Description of a new data frame from 'month' column: 


0    mar
1    oct
2    oct
3    mar
4    mar
Name: month, dtype: object

In [84]:
print("Type of the data frame {}".format(type(monthDF)))


Type of the data frame <class 'pandas.core.series.Series'>


Pandas let us explore different kind of files and shape them to our cover our necessities. So we are going to work with a dataset about pima indians with diabetes. 


In [85]:
pimaData = pd.read_csv('pima-indians-diabetes.data.txt',
                       sep=",", 
                       header=None, 
                       names=['qtyPregnant','plaGlucose','diastolic','tricepThick','serInsulin','bodyMass','diaPedigree','age','variable'])

In [86]:
pimaData.head()

Unnamed: 0,qtyPregnant,plaGlucose,diastolic,tricepThick,serInsulin,bodyMass,diaPedigree,age,variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [87]:
#Query help us to create selections 
zeroAge = pimaData.query("age == 0 ").shape[0]
print("Number of  people with zero age {}".format(zeroAge))

Number of  people with zero age 0


In [88]:
zeroMass = pimaData.query("bodyMass == 0 ").shape[0]
print("Number of  people with zero bodyMass {}".format(zeroMass))

Number of  people with zero bodyMass 11


In [89]:
zeroDias = pimaData.query("diastolic == 0 ").shape[0]
print("Number of  people with zero bodyMass {}".format(zeroDias))

Number of  people with zero bodyMass 35


In [96]:
pimaData.describe()

Unnamed: 0,qtyPregnant,plaGlucose,diastolic,tricepThick,serInsulin,bodyMass,diaPedigree,age,variable
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [90]:
from sklearn.model_selection import train_test_split

pimaTrain, pimaTest = train_test_split(pimaData, test_size=0.3)

print("Type of the pimaTrain {}".format(type(pimaTrain)))

Type of the pimaTrain <class 'pandas.core.frame.DataFrame'>


In [93]:
y = np.array(pimaTrain["variable"])
x = np.array(pimaTrain.drop(["variable"], axis = 1))
print(x)

[[  3.00000000e+00   1.21000000e+02   5.20000000e+01 ...,   3.60000000e+01
    1.27000000e-01   2.50000000e+01]
 [  5.00000000e+00   1.06000000e+02   8.20000000e+01 ...,   3.95000000e+01
    2.86000000e-01   3.80000000e+01]
 [  5.00000000e+00   1.32000000e+02   8.00000000e+01 ...,   2.68000000e+01
    1.86000000e-01   6.90000000e+01]
 ..., 
 [  1.00000000e+00   1.08000000e+02   8.80000000e+01 ...,   2.71000000e+01
    4.00000000e-01   2.40000000e+01]
 [  2.00000000e+00   1.12000000e+02   7.80000000e+01 ...,   3.94000000e+01
    1.75000000e-01   2.40000000e+01]
 [  1.00000000e+00   1.14000000e+02   6.60000000e+01 ...,   3.81000000e+01
    2.89000000e-01   2.10000000e+01]]


In [94]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(x, y)

In [95]:
clf.score(x, y)

0.98510242085661082

In [99]:
meanMass = pimaData['bodyMass'].mean()
print("Mean mass of pima women {}".format(meanMass))

Mean mass of pima women 31.992578124999977


In [101]:
#This won't work 
pimaData.bodyMass.fillna(meanMass)
pimaData.describe()

Unnamed: 0,qtyPregnant,plaGlucose,diastolic,tricepThick,serInsulin,bodyMass,diaPedigree,age,variable
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [102]:
pimaData=pimaData.replace({'bodyMass': {0: meanMass}}) 
pimaData.describe()

Unnamed: 0,qtyPregnant,plaGlucose,diastolic,tricepThick,serInsulin,bodyMass,diaPedigree,age,variable
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,32.450805,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,6.875374,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [103]:
y = np.array(pimaTrain["variable"])
x = np.array(pimaTrain.drop(["variable"], axis = 1))
print(x)

[[  3.00000000e+00   1.21000000e+02   5.20000000e+01 ...,   3.60000000e+01
    1.27000000e-01   2.50000000e+01]
 [  5.00000000e+00   1.06000000e+02   8.20000000e+01 ...,   3.95000000e+01
    2.86000000e-01   3.80000000e+01]
 [  5.00000000e+00   1.32000000e+02   8.00000000e+01 ...,   2.68000000e+01
    1.86000000e-01   6.90000000e+01]
 ..., 
 [  1.00000000e+00   1.08000000e+02   8.80000000e+01 ...,   2.71000000e+01
    4.00000000e-01   2.40000000e+01]
 [  2.00000000e+00   1.12000000e+02   7.80000000e+01 ...,   3.94000000e+01
    1.75000000e-01   2.40000000e+01]
 [  1.00000000e+00   1.14000000e+02   6.60000000e+01 ...,   3.81000000e+01
    2.89000000e-01   2.10000000e+01]]


In [104]:
clf = clf.fit(x, y)
clf.score(x, y)

0.98137802607076352