# Titanic Dataset

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
titanic_train = pd.read_csv('titanic.csv')
titanic_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# First check the dtypes: object columns take more memory than int and float columns.
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [9]:
# Gather the labels of every column whose dtype is object.
# (Any column can be cast to object with .astype('object').)
a = titanic_train.dtypes.loc[lambda dtypes: dtypes == 'object'].index
a  # show the Index holding the object-dtype column names

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [11]:
titanic_train[a].describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,418,418,418,91,418
unique,418,2,363,76,3
top,"Kelly, Mr. James",male,PC 17608,B57 B59 B63 B66,S
freq,1,266,5,3,270


In [14]:
titanic_train.Sex.value_counts()

male      266
female    152
Name: Sex, dtype: int64

In [15]:
# Called without arguments, describe() summarizes only the numeric (int and float) columns.
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [19]:
titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [20]:
req_column = ['PassengerId', 'Survived', 'Pclass', 'Name']
titanic_train[req_column]

Unnamed: 0,PassengerId,Survived,Pclass,Name
0,892,0,3,"Kelly, Mr. James"
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)"
2,894,0,2,"Myles, Mr. Thomas Francis"
3,895,0,3,"Wirz, Mr. Albert"
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf"
414,1306,1,1,"Oliva y Ocana, Dona. Fermina"
415,1307,0,3,"Saether, Mr. Simon Sivertsen"
416,1308,0,3,"Ware, Mr. Frederick"


In [22]:
# Pass include='all' so that describe() summarizes columns of every dtype.
titanic_train.describe(include = 'all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,,418,2,,,,363,,76,3
top,,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,,1,266,,,,5,,3,270
mean,1100.5,0.363636,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.481622,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,0.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,0.0,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,0.0,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,1.0,3.0,,,39.0,1.0,0.0,,31.5,,


In [32]:
# Reduce each Cabin value to its first character. Missing cabins are labelled
# 'n' explicitly instead of relying on NaN being stringified to 'nan' and then
# sliced; the vectorized .str accessor also tolerates empty strings.
Char_cabin = titanic_train.Cabin.str[0].fillna('n')

# Store the letters as a categorical (categories are inferred from the values).
Char_cabin = pd.Categorical(Char_cabin)

Char_cabin

['n', 'n', 'n', 'n', 'n', ..., 'n', 'C', 'n', 'n', 'n']
Length: 418
Categories (8, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'n']

In [33]:
# Remove the raw Cabin column; the `columns` keyword replaces the axis=1 form.
titanic_train = titanic_train.drop(columns=['Cabin'])
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [34]:
titanic_train['Cabin'] = Char_cabin
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Cabin
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,n
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,n
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,n
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,n
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,n


In [44]:
[titanic_train[["Fare", "Cabin"]].value_counts("Cabin")]

[Cabin
 n    327
 C     35
 B     18
 D     13
 E      9
 F      8
 A      7
 G      1
 dtype: int64]

In [45]:
# NOTE(review): 'C' is one of the Cabin categories but is absent from this
# list — confirm whether the omission is intended. `lis` is not used in this cell.
lis = ['n','A','B','D','E','F','G']
# Summary statistics for the rows whose Cabin letter is 'A'.
df1 = titanic_train.loc[titanic_train['Cabin'] == 'A'].describe()

In [46]:
df1

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,1090.857143,0.428571,1.0,43.0,0.285714,0.285714,44.716657
std,139.752673,0.534522,0.0,8.103497,0.48795,0.48795,25.399971
min,920.0,0.0,1.0,33.0,0.0,0.0,27.7208
25%,971.0,0.0,1.0,37.5,0.0,0.0,29.7
50%,1100.0,0.0,1.0,41.0,0.0,0.0,30.5
75%,1204.0,1.0,1.0,49.0,0.5,0.5,56.76875
max,1266.0,1.0,1.0,54.0,1.0,1.0,81.8583


In [8]:
titanic_train["Age"][titanic_train.Age.isnull()].index

Int64Index([ 10,  22,  29,  33,  36,  39,  41,  47,  54,  58,  65,  76,  83,
             84,  85,  88,  91,  93, 102, 107, 108, 111, 116, 121, 124, 127,
            132, 133, 146, 148, 151, 160, 163, 168, 170, 173, 183, 188, 191,
            199, 200, 205, 211, 216, 219, 225, 227, 233, 243, 244, 249, 255,
            256, 265, 266, 267, 268, 271, 273, 274, 282, 286, 288, 289, 290,
            292, 297, 301, 304, 312, 332, 339, 342, 344, 357, 358, 365, 366,
            380, 382, 384, 408, 410, 413, 416, 417],
           dtype='int64')

In [9]:
# Index labels of the rows whose Age is missing.
ind = titanic_train.loc[titanic_train["Age"].isnull(), "Age"].index

In [12]:
# `ind` holds index labels, so select with the label-based .loc; the positional
# .iloc only gives the same rows while the frame keeps its default 0..n-1 index.
titanic_train.loc[ind, :]  # [row, col]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,0,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
22,914,1,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
29,921,0,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
33,925,1,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.4500,,S
36,928,1,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1300,1,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
410,1302,1,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.7500,,Q
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [17]:
#Alternative way:
np.where(titanic_train["Age"].isnull())

(array([ 10,  22,  29,  33,  36,  39,  41,  47,  54,  58,  65,  76,  83,
         84,  85,  88,  91,  93, 102, 107, 108, 111, 116, 121, 124, 127,
        132, 133, 146, 148, 151, 160, 163, 168, 170, 173, 183, 188, 191,
        199, 200, 205, 211, 216, 219, 225, 227, 233, 243, 244, 249, 255,
        256, 265, 266, 267, 268, 271, 273, 274, 282, 286, 288, 289, 290,
        292, 297, 301, 304, 312, 332, 339, 342, 344, 357, 358, 365, 366,
        380, 382, 384, 408, 410, 413, 416, 417], dtype=int64),)

In [47]:
# To count number of null value:
#titanic_train.isnull().sum()

In [19]:
# Positions of the passenger(s) who paid the maximum Fare.
# Series.max() skips NaN, whereas the builtin max() compares element by element
# and can return NaN or a non-maximal value depending on where a missing Fare sits.
np.where(titanic_train["Fare"] == titanic_train["Fare"].max())

(array([343], dtype=int64),)

In [37]:
# Row(s) of the passenger(s) who paid the maximum Fare. Series.max() is used
# because it skips NaN; the builtin max() is order-dependent when NaN is present.
i = np.where(titanic_train["Fare"] == titanic_train["Fare"].max())
titanic_train.iloc[i]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
343,1235,1,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C


In [27]:
titanic_train.Age.isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
413     True
414    False
415    False
416     True
417     True
Name: Age, Length: 418, dtype: bool

New

In [1]:
import numpy as np
import pandas as pd

In [6]:
df = pd.DataFrame(np.random.randn(4,5), index = ["a","b","c","d"], columns = ["z","y","x","v","u"])
df

Unnamed: 0,z,y,x,v,u
a,0.273513,0.621132,-0.074905,0.669971,-1.529461
b,-1.094482,-0.948156,2.125798,0.570649,0.047209
c,-0.054243,0.968862,0.590426,-0.748999,-1.771406
d,-0.159275,-1.869713,1.899701,-0.843606,-1.151056


In [7]:
# Find only the negative values (returned as a flat array, without NaN placeholders):
df.values[df.values<0]

array([-0.07490528, -1.52946057, -1.09448197, -0.94815606, -0.05424308,
       -0.74899894, -1.77140644, -0.15927453, -1.86971316, -0.843606  ,
       -1.15105644])

In [13]:
# Walk the frame row by row and print the row label, column label and value
# of every negative cell.
for row_label, row in df.iterrows():
    for col_label, cell in row.items():
        if cell < 0:
            print(row_label, col_label, " : ", cell)

a x  :  -0.07490527837136463
a u  :  -1.5294605704366822
b z  :  -1.0944819683922005
b y  :  -0.948156056174033
c z  :  -0.05424307658019977
c v  :  -0.7489989409702679
c u  :  -1.7714064390061783
d z  :  -0.15927453467681293
d y  :  -1.8697131575291837
d v  :  -0.8436060018327262
d u  :  -1.1510564360857196


In [15]:
# Keep only the negative entries; every other cell becomes NaN.
df1 = df.where(df < 0)
df1

Unnamed: 0,z,y,x,v,u
a,,,-0.074905,,-1.529461
b,-1.094482,-0.948156,,,
c,-0.054243,,,-0.748999,-1.771406
d,-0.159275,-1.869713,,-0.843606,-1.151056


In [16]:
df1.dropna(thresh = 3)

Unnamed: 0,z,y,x,v,u
c,-0.054243,,,-0.748999,-1.771406
d,-0.159275,-1.869713,,-0.843606,-1.151056


In [17]:
df1.dropna(thresh = 2)

Unnamed: 0,z,y,x,v,u
a,,,-0.074905,,-1.529461
b,-1.094482,-0.948156,,,
c,-0.054243,,,-0.748999,-1.771406
d,-0.159275,-1.869713,,-0.843606,-1.151056


In [18]:
df1.dropna(thresh = 4)

Unnamed: 0,z,y,x,v,u
d,-0.159275,-1.869713,,-0.843606,-1.151056


In [19]:
df2 = df1.copy()
df2

Unnamed: 0,z,y,x,v,u
a,,,-0.074905,,-1.529461
b,-1.094482,-0.948156,,,
c,-0.054243,,,-0.748999,-1.771406
d,-0.159275,-1.869713,,-0.843606,-1.151056


<bound method NDFrame._add_numeric_operations.<locals>.mean of a         NaN
b   -1.094482
c   -0.054243
d   -0.159275
Name: z, dtype: float64>

In [23]:
# Fill each column's NaNs with that column's mean. The column -> mean mapping
# is derived from the frame itself instead of listing every column by hand,
# so it stays correct if the set of columns changes.
value = df2.mean().to_dict()
df2.fillna(value=value)

Unnamed: 0,z,y,x,v,u
a,-0.436,-1.408935,-0.074905,-0.796302,-1.529461
b,-1.094482,-0.948156,-0.074905,-0.796302,-1.483974
c,-0.054243,-1.408935,-0.074905,-0.748999,-1.771406
d,-0.159275,-1.869713,-0.074905,-0.843606,-1.151056


In [7]:
# Fix the RNG seed so the same random values are drawn on every run, and use
# date_range to generate the date index for the rows.
np.random.seed(20)
df1 = pd.DataFrame(
    data=np.random.randn(10, 4),
    index=pd.date_range(start="2021-01-10", periods=10),
    columns=["A", "B", "C", "D"],
)
df1

Unnamed: 0,A,B,C,D
2021-01-10,0.883893,0.195865,0.357537,-2.343262
2021-01-11,-1.084833,0.559696,0.939469,-0.978481
2021-01-12,0.503097,0.406414,0.323461,-0.493411
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715
2021-01-14,-0.044195,1.567633,1.051109,0.406368
2021-01-15,-0.168646,-3.189703,1.120132,1.332778
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579
2021-01-18,0.932866,2.059838,-0.934938,-1.61299
2021-01-19,0.52707,-1.551101,0.329613,-1.136527


In [8]:
# Vectorized addition: adds 2 to every value of A without a per-element
# Python-level lambda call through .apply().
df1["E"] = df1["A"] + 2
df1

Unnamed: 0,A,B,C,D,E
2021-01-10,0.883893,0.195865,0.357537,-2.343262,2.883893
2021-01-11,-1.084833,0.559696,0.939469,-0.978481,0.915167
2021-01-12,0.503097,0.406414,0.323461,-0.493411,2.503097
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715,1.207983
2021-01-14,-0.044195,1.567633,1.051109,0.406368,1.955805
2021-01-15,-0.168646,-3.189703,1.120132,1.332778,1.831354
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186,1.756661
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579,2.128778
2021-01-18,0.932866,2.059838,-0.934938,-1.61299,2.932866
2021-01-19,0.52707,-1.551101,0.329613,-1.136527,2.52707


In [12]:
#Task is to create is_A_neg column which will give 1 for positive and 0 for negative

# creating function:
def is_neg(x):
    """Return 1 when ``x`` is strictly greater than 0, otherwise 0.

    Note: despite the name, positive values map to 1; zero and negative
    values map to 0.
    """
    if x > 0:
        return 1
    return 0

df1["is_A_neg"] = df1["A"].apply(is_neg)
df1

Unnamed: 0,A,B,C,D,E,is_A_neg
2021-01-10,0.883893,0.195865,0.357537,-2.343262,2.883893,1
2021-01-11,-1.084833,0.559696,0.939469,-0.978481,0.915167,0
2021-01-12,0.503097,0.406414,0.323461,-0.493411,2.503097,1
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715,1.207983,0
2021-01-14,-0.044195,1.567633,1.051109,0.406368,1.955805,0
2021-01-15,-0.168646,-3.189703,1.120132,1.332778,1.831354,0
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186,1.756661,0
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579,2.128778,1
2021-01-18,0.932866,2.059838,-0.934938,-1.61299,2.932866,1
2021-01-19,0.52707,-1.551101,0.329613,-1.136527,2.52707,1


In [15]:
# Copy the index's day, month and year components into separate columns.
df1["date"] = df1.index.day
df1["month"] = df1.index.month
df1["year"] = df1.index.year
df1

Unnamed: 0,A,B,C,D,E,is_A_neg,date,month,year
2021-01-10,0.883893,0.195865,0.357537,-2.343262,2.883893,1,10,1,2021
2021-01-11,-1.084833,0.559696,0.939469,-0.978481,0.915167,0,11,1,2021
2021-01-12,0.503097,0.406414,0.323461,-0.493411,2.503097,1,12,1,2021
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715,1.207983,0,13,1,2021
2021-01-14,-0.044195,1.567633,1.051109,0.406368,1.955805,0,14,1,2021
2021-01-15,-0.168646,-3.189703,1.120132,1.332778,1.831354,0,15,1,2021
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186,1.756661,0,16,1,2021
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579,2.128778,1,17,1,2021
2021-01-18,0.932866,2.059838,-0.934938,-1.61299,2.932866,1,18,1,2021
2021-01-19,0.52707,-1.551101,0.329613,-1.136527,2.52707,1,19,1,2021


In [16]:
#converting months into name:
def months(x):
    """Return the display name for month number ``x``.

    The names for 1-5 are the ones the mapping always used; 6-12 are included
    so that dates after May no longer raise ``KeyError``.

    Parameters
    ----------
    x : int
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    str
        The month's name as stored in the lookup table.

    Raises
    ------
    KeyError
        If ``x`` is not a key of the lookup table (i.e. not in 1-12).
    """
    dic = {
        1: "Jan", 2: "Feb", 3: "March", 4: "April", 5: "May", 6: "June",
        7: "July", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec",
    }
    return dic[x]
# Derive the names from the index's month numbers rather than from the existing
# "month" column, so re-running this cell does not feed already-converted
# names (e.g. "Jan") back into months() and raise KeyError.
df1["month"] = df1.index.month.map(months)
df1

Unnamed: 0,A,B,C,D,E,is_A_neg,date,month,year
2021-01-10,0.883893,0.195865,0.357537,-2.343262,2.883893,1,10,Jan,2021
2021-01-11,-1.084833,0.559696,0.939469,-0.978481,0.915167,0,11,Jan,2021
2021-01-12,0.503097,0.406414,0.323461,-0.493411,2.503097,1,12,Jan,2021
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715,1.207983,0,13,Jan,2021
2021-01-14,-0.044195,1.567633,1.051109,0.406368,1.955805,0,14,Jan,2021
2021-01-15,-0.168646,-3.189703,1.120132,1.332778,1.831354,0,15,Jan,2021
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186,1.756661,0,16,Jan,2021
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579,2.128778,1,17,Jan,2021
2021-01-18,0.932866,2.059838,-0.934938,-1.61299,2.932866,1,18,Jan,2021
2021-01-19,0.52707,-1.551101,0.329613,-1.136527,2.52707,1,19,Jan,2021


In [18]:
df1["date"] = df1.index.day_name()
df1

Unnamed: 0,A,B,C,D,E,is_A_neg,date,month,year
2021-01-10,0.883893,0.195865,0.357537,-2.343262,2.883893,1,Sunday,Jan,2021
2021-01-11,-1.084833,0.559696,0.939469,-0.978481,0.915167,0,Monday,Jan,2021
2021-01-12,0.503097,0.406414,0.323461,-0.493411,2.503097,1,Tuesday,Jan,2021
2021-01-13,-0.792017,-0.842368,-1.279503,0.245715,1.207983,0,Wednesday,Jan,2021
2021-01-14,-0.044195,1.567633,1.051109,0.406368,1.955805,0,Thursday,Jan,2021
2021-01-15,-0.168646,-3.189703,1.120132,1.332778,1.831354,0,Friday,Jan,2021
2021-01-16,-0.243339,-0.130031,-0.109017,1.556186,1.756661,0,Saturday,Jan,2021
2021-01-17,0.128778,-2.066949,-0.885493,-1.104579,2.128778,1,Sunday,Jan,2021
2021-01-18,0.932866,2.059838,-0.934938,-1.61299,2.932866,1,Monday,Jan,2021
2021-01-19,0.52707,-1.551101,0.329613,-1.136527,2.52707,1,Tuesday,Jan,2021
