# GEC Data Science Program
## Level 1, Lab 3

### Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import preprocessing, model_selection

In [None]:
# Apply seaborn's "darkgrid" style (with color_codes enabled) to the plots below.
sns.set(style="darkgrid", color_codes=True)
# IPython magic: only valid inside a notebook/IPython session, not plain Python.
# NOTE(review): presumably kept after sns.set on purpose; activating the inline
# backend may touch rcParams, so confirm before reordering these two lines.
%matplotlib inline

### Regression -- Predicting Passengers' Age
As we know, some Age values are missing.
Let's create a regression model to predict the missing Age values.

### Data Loading and Preprocessing

In [None]:
# Load the training data, using the first CSV column as the row index.
d = pd.read_csv(
    filepath_or_buffer="train.csv",
    index_col=0,
)

In [None]:
# Column to be predicted by the regression models.
target = 'Age'

# Input columns used to predict the target.
features = [
    'Survived',
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

In [None]:
# Separate the predictor columns from the target column.
y = d.loc[:, target]
df = d.loc[:, features]

#### Encoding

In [None]:
# Categorical columns to expand into indicator (dummy) columns.
features_to_encode = ["Sex", "Pclass", "Embarked"]

# drop_first=True drops one level per encoded column.
df_clean = pd.get_dummies(
    df,
    columns=features_to_encode,
    drop_first=True,
)

#### Log Transformation

In [None]:
# Replace Fare with log(Fare + 1); the +1 keeps zero fares finite (log(1) == 0).
df_clean["Fare"] = np.log(df_clean["Fare"] + 1)

#### Normalization (z-scoring)

In [None]:
# Columns that will be z-scored.
features_to_normalize = ['Fare', 'Parch', 'SibSp']

# Work on just those columns.
df_tmp = df_clean.loc[:, features_to_normalize]

In [None]:
# Z-score each column: subtract the column mean, then divide by the column std.
df_tmp = df_tmp.sub(df_tmp.mean()).div(df_tmp.std())

In [None]:
# Write each z-scored column back over its original column in df_clean.
for col in features_to_normalize:
    df_clean[col] = df_tmp[col]

In [None]:
# Preview the first five rows of the preprocessed features.
df_clean.head(n=5)

#### Histogram Inspection

In [None]:
# One histogram per column, arranged on a 2x5 grid.
df_clean.hist(
    figsize=(15, 5),
    layout=(2, 5),
);

#### Missing values inspection

In [None]:
# Count missing values per column.
df_clean.isna().sum()

### Q: Any highly correlated features?

In [None]:
# Annotated heatmap of the pairwise feature correlations.
sns.heatmap(data=df_clean.corr(), annot=True);

### Split Data

In [None]:
# Boolean mask: True where the target (Age) is missing.
idx_age_missing = y.isna()

In [None]:
# Feature rows whose Age is unknown (the rows to impute later).
df_missing_age = df_clean[idx_age_missing]

In [None]:
# Keep only the rows where Age is known; these are used for fitting/evaluation.
y1 = y[~idx_age_missing]
df1 = df_clean[~idx_age_missing]

In [None]:
# Hold out 20% of the rows with a known Age for testing. The seed is fixed so
# the split, and therefore every metric reported below, is reproducible
# across notebook runs.
train_idx, test_idx = model_selection.train_test_split(
    df1.index,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Slice features and target by the sampled index labels.
X_train, y_train = df1.loc[train_idx], y1.loc[train_idx]
X_test, y_test = df1.loc[test_idx], y1.loc[test_idx]

In [None]:
# Number of rows in the training and test sets.
X_train.shape[0], X_test.shape[0]

### Linear Regression

In statistics, linear regression is a linear approach for modeling the relationship between a scalar dependent variable $y$ and one or more explanatory variables (or independent variables) denoted $x$.

The case of one explanatory variable is called simple linear regression: $$ y= \beta x+\epsilon $$ 

For more than one explanatory variable, the process is called multiple linear regression: $$y=\beta_1 x_1+\beta_2 x_2+...+\beta_n x_n+\epsilon $$

### Model Fitting


In [None]:
# Scatter of the (transformed) Fare feature against Age on the training set.
plt.scatter(X_train["Fare"], y_train);

In [None]:
# Linear regression estimator created with scikit-learn's default settings.
model = linear_model.LinearRegression()

In [None]:
# Fit the linear model on the training split.
model.fit(X_train, y_train);

In [None]:
# Compare the model's in-sample fit against the true training ages.
print("Fitted model")
y_fitted = model.predict(X_train)
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_train, y=y_fitted, kind="reg", height=7);
g.set_axis_labels(xlabel="Age", ylabel="fitted Age");

In [None]:
# Show the fitted intercept, then plot the coefficients sorted by value.
print('Intercept:', model.intercept_)
print('Coefficients:')
coefs_df = pd.DataFrame({'coef': model.coef_}, index=X_train.columns);
coefs_df.sort_values(by='coef', ascending=False).plot.barh();

### Q: Which features are more important?

### Q: Is it OK to use Survived as a feature to predict Age?

### Predicting


In [None]:
# Predict Age for the held-out test rows with the fitted linear model.
y_pred = model.predict(X_test)

### Model Evaluation

In [None]:
# Pearson correlation between predictions and true ages (off-diagonal entry
# of the 2x2 correlation matrix).
pearson_r = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation between predictions and actual values:", pearson_r)

In [None]:
# Compare test-set predictions against the true ages.
print("Predictions")
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_test, y=y_pred, kind="reg", height=7);
g.set_axis_labels(xlabel="Age", ylabel="predicted Age");

In [None]:
from sklearn import metrics
import numpy as np

In [None]:
# Test-set error metrics; RMSE is the square root of the MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("r^2 score:", metrics.r2_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Residual = true age minus predicted age, per test row.
residuals = y_test.sub(y_pred)

In [None]:
# Distribution of the test-set residuals.
plt.hist(x=residuals);

### Other regression algorithms
http://scikit-learn.org/stable/supervised_learning.html#supervised-learning 

### Lasso Regression

In [None]:
# Lasso regression with scikit-learn's default settings, fitted on the
# training split.
lasso = linear_model.Lasso()
lasso.fit(X=X_train, y=y_train)

In [None]:
# Show the Lasso intercept, then plot its coefficients sorted by value.
print('Intercept:', lasso.intercept_)
print('Coefficients:')
coefs_df = pd.DataFrame({'coef': lasso.coef_}, index=X_train.columns)
coefs_df.sort_values(by='coef', ascending=False).plot.barh()

In [None]:
# Compare the Lasso model's in-sample fit against the true training ages.
print("Lasso - Fitted model")
y_fitted = lasso.predict(X_train)
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_train, y=y_fitted, kind="reg", height=7);
g.set_axis_labels(xlabel="Age", ylabel="fitted Age");

In [None]:
# Predict Age for the held-out test rows with the fitted Lasso model
# (overwrites the earlier y_pred).
y_pred = lasso.predict(X_test)

In [None]:
# Compare Lasso test-set predictions against the true ages.
print("Lasso - Predictions")
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_test, y=y_pred, kind="reg", height=7);
g.set_axis_labels(xlabel="Age", ylabel="predicted Age");

In [None]:
# Lasso test-set error metrics. Each label is printed on the same line as its
# value, matching the metric cells for the other models in this notebook
# (the old `print("MSE:",)` form put label and value on separate lines).
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("r^2 score:", metrics.r2_score(y_true=y_test, y_pred=y_pred))

### Support Vector Machine (SVM)

In [None]:
from sklearn import svm

In [None]:
# Support vector regressor with C=100; fit on the training split and
# predict Age for the test rows.
model_svm = svm.SVR(C=100)
model_svm.fit(X=X_train, y=y_train)
y_pred = model_svm.predict(X_test)

In [None]:
# Compare the SVM's in-sample fit against the true training ages.
# The title print is restored (in Python 3 form) to match the other models.
print("SVM - Fitted model")
y_fitted = model_svm.predict(X_train)
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_train, y=y_fitted, height=7);
g.set_axis_labels(xlabel="Age", ylabel="fitted Age");

In [None]:
# Predict Age for the held-out test rows with the fitted SVM.
# NOTE(review): the cell that fits model_svm already makes this same call;
# this repeat looks redundant unless cells are run out of order.
y_pred = model_svm.predict(X_test)

In [None]:
# Compare SVM test-set predictions against the true ages.
# The title print is restored (in Python 3 form) to match the other models.
print("SVM - Predictions")
# seaborn's jointplot takes x/y as keyword arguments and uses `height`
# (the older `size` keyword has been removed).
g = sns.jointplot(x=y_test, y=y_pred, height=7);
g.set_axis_labels(xlabel="Age", ylabel="predicted Age");

In [None]:
# SVM test-set error metrics; RMSE is the square root of the MSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("r^2 score:", metrics.r2_score(y_true=y_test, y_pred=y_pred))

### Q: How can we fine-tune the hyperparameters?

### Q: Which model is best so far?

## Homework

### Q: How to improve accuracy? What if we use "Title"?

### Q: Use another regression algorithm and see if it performs better.

### Q: Using the best model, predict and impute Age for the passengers whose Age is missing. Save your final dataset.