In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf


In [2]:
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/titanic.csv"
TitanicRawData = pd.read_csv(url)

In [3]:
TitanicRawData.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
TitanicRawData.head(6)

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


### Single Imputation - Mean

In [5]:
del TitanicRawData['cabin']

In [6]:
del TitanicRawData['name']

#### Let's replace missing values by mean of age

In [7]:
TitanicRawData['age'].fillna(value = np.mean(TitanicRawData['age']), inplace = True)

TitanicRawData.head(6)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket,fare,embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S
5,0,3,male,29.699118,0,0,330877,8.4583,Q


In [8]:
TitanicRawData.describe()  # We don't have any other missing values but what we did was a terible practice!

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Single Imputation - Regression Lines

In [9]:
# Now let's run a model and see what is the best way to predict age
TitanicRawData.corr()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
survived,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307
pclass,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495
age,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566
sibsp,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651
parch,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225
fare,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0


In [10]:
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/titanic.csv"
TitanicRawData = pd.read_csv(url)
TitanicDroppedValues = TitanicRawData.dropna(subset = ['age'])


In [11]:
X1 = TitanicDroppedValues[['pclass','sibsp','fare','survived']]
y1 = TitanicDroppedValues['age']

In [12]:
lm1 = smf.ols(formula = 'y1 ~ X1', data = TitanicDroppedValues).fit()
print(lm1.pvalues)


Intercept    3.241913e-113
X1[0]         1.102657e-28
X1[1]         7.458058e-16
X1[2]         1.036460e-01
X1[3]         2.629116e-11
dtype: float64


#### It looks like 'fare' is not significant

In [13]:
X2 = TitanicDroppedValues[['pclass','sibsp','survived']]
y2 = TitanicDroppedValues['age']

In [14]:
lm2 = smf.ols(formula = 'y2 ~ X2', data = TitanicDroppedValues).fit()
print(lm2.pvalues)

Intercept    1.880023e-140
X2[0]         3.678235e-33
X2[1]         1.136303e-17
X2[2]         8.904637e-12
dtype: float64


In [15]:
lm = LinearRegression()

lm.fit(X2, y2)

X = TitanicRawData[['pclass','sibsp','survived']]

TitanicRawData['age_predict'] = lm.predict(X)
TitanicRawData['age'].fillna(value = TitanicRawData['age_predict'], inplace = True)

In [16]:
TitanicRawData.head(6)

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,age_predict
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,24.643607
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,32.695841
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22.00189
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,32.695841
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,29.066091
5,0,3,"Moran, Mr. James",male,29.066091,0,0,330877,8.4583,,Q,29.066091


#### Single Imputation Regression line with Error

In [17]:
url = "https://raw.githubusercontent.com/ga-students/DS-SF-24/master/Data/titanic.csv"
TitanicRawData = pd.read_csv(url)
TitanicDroppedValues = TitanicRawData.dropna(subset = ['age'])

linreg = LinearRegression()

X = TitanicDroppedValues[['pclass','sibsp','survived']]
y = TitanicDroppedValues['age']

linreg.fit(X,y)

y_hat = linreg.predict(X)

Standard_error = ( sum( (y - y_hat) ** 2 )/(len(y) - 2) ) ** .5

print(Standard_error)



12.4478080029


In [18]:
y_min = y.min()
y_max = y.max()
X = TitanicRawData[['pclass','sibsp','survived']]
print(len(TitanicRawData))
#let's add error term to our prediction

TitanicRawData['predicted_age_error'] = linreg.predict(X) + np.random.normal(0, Standard_error, len(TitanicRawData))


print('Minimum Values before adjustment %f:'  %TitanicRawData['predicted_age_error'].min())


##### Here we make sure we don't predict beyond or below our age limits

TitanicRawData.loc[(TitanicRawData['predicted_age_error'] < y_min), 'predicted_age_error'] = y_min 
TitanicRawData.loc[(TitanicRawData['predicted_age_error'] > y_max), 'predicted_age_error'] = y_max

print('Minimum Values after adjustment %f:'  %TitanicRawData['predicted_age_error'].min())


TitanicRawData['age'].fillna(value = TitanicRawData['predicted_age_error'], inplace = True)


TitanicRawData.head(20)


891
Minimum Values before adjustment -20.880879:
Minimum Values after adjustment 0.420000:


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,predicted_age_error
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,14.710506
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,64.654674
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,14.944866
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,30.604651
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,66.076407
5,0,3,"Moran, Mr. James",male,14.44295,0,0,330877,8.4583,,Q,14.44295
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,48.517401
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0.42
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,18.395932
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,5.583686
