## pandas
* pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

* Two important data types defined by pandas are Series and DataFrame.
* You can think of a Series as a “column” of data, such as a collection of observations on a single variable. A DataFrame is an object for storing related columns of data.

In [5]:
# importing pandas library
import pandas as pd

In [6]:
# creating a sample dictionary
sample_dict = { 'name' : ["a", "b", "c", "d", "e"],
               'age' : [42, 18, 26, 22, 23],
              'designation' :["CEO", "MD", "VP", "CEO", "CFO"]}

* There are several ways to create a DataFrame. One way way is to use a dictionary.

In [7]:
df1 = pd.DataFrame(sample_dict)

* Pandas DataFrame is a 2-D labeled data structure with columns of potentially different type.

In [8]:
df1

Unnamed: 0,name,age,designation
0,a,42,CEO
1,b,18,MD
2,c,26,VP
3,d,22,CEO
4,e,23,CFO


* data can be also imported from various methods like using csv file, excel file, using SQL query, parsing html page and othe means, as well can be exported to various file types

Let's export the df1 to a csv file

In [9]:
df1.to_csv('file.csv')

In [10]:
# loading data from csv file we just saved above
df2 = pd.read_csv('file.csv')
df2

Unnamed: 0.1,Unnamed: 0,name,age,designation
0,0,a,42,CEO
1,1,b,18,MD
2,2,c,26,VP
3,3,d,22,CEO
4,4,e,23,CFO


In [11]:
# importing a data to use
from sklearn import datasets as dset
h_data = dset.fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/pranjay/scikit_learn_data


In [15]:
# names of predictors in dataset
print(h_data.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [16]:
# shape of data of predictor variables
print(h_data.data.shape)

# shape of data of target variable
print(h_data.target.shape)

(20640, 8)
(20640,)


In [18]:
cal_housing = pd.DataFrame(h_data.data)

In [19]:
cal_housing

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.1200,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25


## Renaming column in the data frame

In [20]:
cal_housing.columns = h_data.feature_names

In [21]:
cal_housing

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.1200,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25


# Viewing or Inspecting data

In [22]:
# head(n) is used to return top n rows of the dataset 
# default value of n is equal to 5
cal_housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [23]:
# similarly tail(n) is used to return last n rows of the dataset 
# default value of n is equal to 5
cal_housing.tail(7)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
20633,2.5495,27.0,5.445026,1.078534,1082.0,2.832461,39.19,-121.53
20634,3.7125,28.0,6.77907,1.148256,1041.0,3.026163,39.27,-121.56
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24


In [24]:
# shape return the number of rown and column in form of a tuple
cal_housing.shape

(20640, 8)

In [25]:
# info() returns index, Datatype and memory information 
cal_housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
MedInc        20640 non-null float64
HouseAge      20640 non-null float64
AveRooms      20640 non-null float64
AveBedrms     20640 non-null float64
Population    20640 non-null float64
AveOccup      20640 non-null float64
Latitude      20640 non-null float64
Longitude     20640 non-null float64
dtypes: float64(8)
memory usage: 1.3 MB


In [26]:
# Dataframe.Series.value_counts() returns unique values and counts
# where Series is a coulumn of Dataframe as a column in a dataframe is
# a Series DataType

# returns count of unique values in MedInc column
cal_housing.MedInc.value_counts()

15.0001    49
3.1250     49
2.8750     46
2.6250     44
4.1250     44
3.8750     41
3.3750     38
3.0000     38
4.0000     37
3.6250     37
4.3750     35
2.1250     33
2.3750     32
4.6250     31
3.5000     30
2.2500     29
4.8750     29
3.2500     29
1.6250     29
3.7500     29
2.5000     28
4.2500     28
3.6875     26
2.7500     25
4.5000     24
2.5625     21
1.8750     21
5.0000     20
3.0625     19
3.3125     18
           ..
5.7780      1
1.3672      1
1.5735      1
7.0245      1
2.5389      1
6.2113      1
6.0591      1
2.1403      1
1.5161      1
1.9306      1
4.0517      1
2.5599      1
5.6263      1
5.2649      1
4.2775      1
6.7744      1
4.0677      1
2.1216      1
4.1449      1
6.0808      1
3.5082      1
6.7079      1
4.3812      1
6.6833      1
7.3031      1
2.7209      1
5.1230      1
1.2614      1
2.0294      1
4.6429      1
Name: MedInc, Length: 12928, dtype: int64

## Selection

In [27]:
# Selecting rows using their index
cal_housing[2:5]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [28]:
# returns a Series data type or a single selected column

print(type(cal_housing['HouseAge']))
cal_housing['HouseAge'].shape

<class 'pandas.core.series.Series'>


(20640,)

In [29]:
# To select columns, we can pass a list containing the
# names of the desired columns represented as strings

cal_housing[ ['HouseAge', 'Latitude', 'Longitude']].head(10)

Unnamed: 0,HouseAge,Latitude,Longitude
0,41.0,37.88,-122.23
1,21.0,37.86,-122.22
2,52.0,37.85,-122.24
3,52.0,37.85,-122.25
4,52.0,37.85,-122.25
5,52.0,37.85,-122.25
6,52.0,37.84,-122.25
7,52.0,37.84,-122.25
8,42.0,37.84,-122.26
9,52.0,37.84,-122.25


In [30]:
# To select rows and columns using index

cal_housing.iloc[2:6, 3:5]

Unnamed: 0,AveBedrms,Population
2,1.073446,496.0
3,1.073059,558.0
4,1.081081,565.0
5,1.103627,413.0


In [31]:
# To select rows and and columns using a mixture of integers and labels,
# the loc attribute can be used in a similar way

cal_housing.loc[cal_housing.index[2:6], ['AveBedrms', 'Population']]

Unnamed: 0,AveBedrms,Population
2,1.073446,496.0
3,1.073059,558.0
4,1.081081,565.0
5,1.103627,413.0


## Statistics

In [32]:
# describe() summary statistics of numerical columns
cal_housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [33]:
# mean() return the mean of all columns
cal_housing.mean()

MedInc           3.870671
HouseAge        28.639486
AveRooms         5.429000
AveBedrms        1.096675
Population    1425.476744
AveOccup         3.070655
Latitude        35.631861
Longitude     -119.569704
dtype: float64

In [34]:
# corr() return the correlation between columns in dataframe
cal_housing.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0


In [35]:
# count() return the number of non null values in each dataframe column
cal_housing.count()

MedInc        20640
HouseAge      20640
AveRooms      20640
AveBedrms     20640
Population    20640
AveOccup      20640
Latitude      20640
Longitude     20640
dtype: int64

In [36]:
# max() and min() return the maximum and minimum values from
#each column in a dataframe respectively
print(cal_housing.max())
print(cal_housing.min())

MedInc           15.000100
HouseAge         52.000000
AveRooms        141.909091
AveBedrms        34.066667
Population    35682.000000
AveOccup       1243.333333
Latitude         41.950000
Longitude      -114.310000
dtype: float64
MedInc          0.499900
HouseAge        1.000000
AveRooms        0.846154
AveBedrms       0.333333
Population      3.000000
AveOccup        0.692308
Latitude       32.540000
Longitude    -124.350000
dtype: float64


In [37]:
# median() returns the median of each column
cal_housing.median()

MedInc           3.534800
HouseAge        29.000000
AveRooms         5.229129
AveBedrms        1.048780
Population    1166.000000
AveOccup         2.818116
Latitude        34.260000
Longitude     -118.490000
dtype: float64

In [38]:
# std() return standard deviation of each columns
cal_housing.std()

MedInc           1.899822
HouseAge        12.585558
AveRooms         2.474173
AveBedrms        0.473911
Population    1132.462122
AveOccup        10.386050
Latitude         2.135952
Longitude        2.003532
dtype: float64

## Data Cleaning

In [40]:
# loading titanic passenger survival dataset

tt = pd.read_csv('train.csv')
tt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [48]:
# changing index to column name
# using inplace = True to get update in original dataframe object
tt.set_index('Name', inplace = True)

In [49]:
tt

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.0750,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,female,14.0,1,0,237736,30.0708,,C


In [50]:
# renaming a column

tt.rename(columns = {'Pclass':'PassangerClass'}, inplace = True)
tt

Unnamed: 0_level_0,PassengerId,Survived,PassangerClass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.0750,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,female,14.0,1,0,237736,30.0708,,C


In [54]:
# check null entries present using isnull()

tt.isnull().sum()

PassengerId         0
Survived            0
PassangerClass      0
Sex                 0
Age               177
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin             687
Embarked            2
dtype: int64

In [57]:
# return data type of each series
tt.dtypes

PassengerId         int64
Survived            int64
PassangerClass      int64
Sex                object
Age               float64
SibSp               int64
Parch               int64
Ticket             object
Fare              float64
Cabin              object
Embarked           object
dtype: object

In [60]:
# filling null values of Age column with median of Age
tt.Age.fillna(tt.Age.median(), inplace = True)
tt.isnull().sum()

PassengerId         0
Survived            0
PassangerClass      0
Sex                 0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin             687
Embarked            2
dtype: int64

In [74]:
# return rows where Passenger belongs to 3rd Class
tt[ (tt['PassangerClass'] == 3) ].head(10)

Unnamed: 0_level_0,PassengerId,Survived,PassangerClass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S
"Moran, Mr. James",6,0,3,male,28.0,0,0,330877,8.4583,,Q
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.075,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Sandstrom, Miss. Marguerite Rut",11,1,3,female,4.0,1,1,PP 9549,16.7,G6,S
"Saundercock, Mr. William Henry",13,0,3,male,20.0,0,0,A/5. 2151,8.05,,S
"Andersson, Mr. Anders Johan",14,0,3,male,39.0,1,5,347082,31.275,,S
"Vestrom, Miss. Hulda Amanda Adolfina",15,0,3,female,14.0,0,0,350406,7.8542,,S


In [75]:
# return rows where Age is less than 40 and greater than 18
tt[(tt['Age'] < 40) & (tt['Age'] > 18)].head(10)

Unnamed: 0_level_0,PassengerId,Survived,PassangerClass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S
"Moran, Mr. James",6,0,3,male,28.0,0,0,330877,8.4583,,Q
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Saundercock, Mr. William Henry",13,0,3,male,20.0,0,0,A/5. 2151,8.05,,S
"Andersson, Mr. Anders Johan",14,0,3,male,39.0,1,5,347082,31.275,,S
"Williams, Mr. Charles Eugene",18,1,2,male,28.0,0,0,244373,13.0,,S


In [76]:
# sort all the rows with age in acending order
tt.sort_values('Age')

Unnamed: 0_level_0,PassengerId,Survived,PassangerClass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Thomas, Master. Assad Alexander",804,1,3,male,0.42,0,1,2625,8.5167,,C
"Hamalainen, Master. Viljo",756,1,2,male,0.67,1,1,250649,14.5000,,S
"Baclini, Miss. Eugenie",645,1,3,female,0.75,2,1,2666,19.2583,,C
"Baclini, Miss. Helene Barbara",470,1,3,female,0.75,2,1,2666,19.2583,,C
"Richards, Master. George Sibley",832,1,2,male,0.83,1,1,29106,18.7500,,S
"Caldwell, Master. Alden Gates",79,1,2,male,0.83,0,2,248738,29.0000,,S
"Allison, Master. Hudson Trevor",306,1,1,male,0.92,1,2,113781,151.5500,C22 C26,S
"Panula, Master. Eino Viljami",165,0,3,male,1.00,4,1,3101295,39.6875,,S
"Mallet, Master. Andre",828,1,2,male,1.00,0,2,S.C./PARIS 2079,37.0042,,C
"Johnson, Miss. Eleanor Ileen",173,1,3,female,1.00,1,1,347742,11.1333,,S


In [77]:
tt.Sex = tt.Sex.replace(['male', 'female'],['0', '1'])
tt.head()

Unnamed: 0_level_0,PassengerId,Survived,PassangerClass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,0,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,1,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,0,35.0,0,0,373450,8.05,,S


In [81]:
# now changing data type of Column Sex to float
tt.Sex = tt.Sex.astype(float)
tt.dtypes

PassengerId         int64
Survived            int64
PassangerClass      int64
Sex               float64
Age               float64
SibSp               int64
Parch               int64
Ticket             object
Fare              float64
Cabin              object
Embarked           object
dtype: object