# GEC Data Science Program
## Level 1, Lab 2

### Imports

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Data load

We use Titanic data for this lab again.

In [None]:
# Load train.csv; the first CSV column becomes the DataFrame index.
d = pd.read_csv("train.csv", index_col=0)

In [None]:
# Preview the first rows of the loaded data.
d.head()

### More on Visualization

#### A nice visualization library: Seaborn
https://seaborn.pydata.org/

In [None]:
# install seaborn if needed
!pip install seaborn

In [None]:
import seaborn as sns

In [None]:
# Summarise the columns: dtype and non-null count of each.
d.info()

In [None]:
# Numeric subset used by the exploratory plots below; missing values are
# replaced with 0.
d1 = (
    d.loc[:, ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
    .fillna(value=0)
)

In [None]:
# Histogram of every column of d1, arranged on a 2x3 grid.
d1.hist(layout=(2,3), figsize=(15,5));

In [None]:
# Pairwise plots of the d1 columns, coloured by the value of 'Survived'.
sns.pairplot(d1, hue="Survived");

In [None]:
# Age distribution per passenger class; each violin is split by sex.
sns.violinplot(x="Pclass", y="Age", hue="Sex", data=d, split=True);

In [None]:
# Fare of each passenger by port of embarkation, with the two 'Survived'
# groups drawn side by side. `dodge` is the current name of the keyword that
# swarm plots used to call `split`; recent seaborn no longer accepts `split`.
sns.swarmplot(x="Embarked", y="Fare", hue="Survived", data=d, dodge=True, color='y');

In [None]:
# Correlation matrix of d1 as a heatmap, with each coefficient printed in its cell.
sns.heatmap(d1.corr(), annot=True)

In [None]:
# Joint density of age and log(fare + 1).
# x and y are passed by keyword: current seaborn treats the first positional
# argument as `data` and no longer accepts a second positional vector.
sns.kdeplot(x=d1.Age, y=np.log(d1.Fare+1));

In [None]:
# Joint age / log(fare + 1) densities, survivors in blue and non-survivors in
# red (semi-transparent so the overlap stays visible).
# - x / y are passed by keyword, as required by current seaborn.
# - `fill=True` replaces the deprecated `shade=True`.
# - `thresh=0.05` leaves the lowest-density region undrawn, which is what
#   `shade_lowest=False` used to do.
sns.kdeplot(
    x=d1.Age[d1.Survived==1],
    y=np.log(d1.Fare[d1.Survived==1]+1),
    fill=True, thresh=0.05, cmap=plt.cm.Blues,
);
sns.kdeplot(
    x=d1.Age[d1.Survived==0],
    y=np.log(d1.Fare[d1.Survived==0]+1),
    fill=True, thresh=0.05, cmap=plt.cm.Reds, alpha=0.7,
);

## Pre-processing

### Q: What data types do we have in this dataset?

In [None]:
# List the column names.
d.columns

In [None]:
# Look at sample rows again to judge what kind of data each column holds.
d.head()

In [None]:
# Data type of each column:
# 'Survived' : binary
# 'Pclass'   : categorical
# 'Name'     : string
# 'Sex'      : categorical
# 'Age'      : numerical
# 'SibSp'    : numerical
# 'Parch'    : numerical
# 'Ticket'   : string
# 'Fare'     : numerical
# 'Cabin'    : string
# 'Embarked' : categorical

### Q: How many unique values are there in each column?

### Q: Which columns can (or cannot) be used as features?

In [None]:
# Columns kept as model inputs.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

In [None]:
# Name of the label column.
target = "Survived"

In [None]:
# Keep only the selected feature columns.
df = d[features]

In [None]:
# Preview the feature table.
df.head()

## Encoding

### Q: How do we deal with Categorical features?

In [None]:
# One-hot encoding: preview the table with 'Sex' expanded into indicator columns.
pd.get_dummies(df, columns=["Sex"]).head()

In [None]:
# One-hot encode the categorical features, dropping the first level of each.
# dtype=int keeps the indicator columns numeric (0/1): pandas >= 2.0 creates
# bool columns by default, and DataFrame.hist() only draws numeric columns.
df_encoded = pd.get_dummies(
    df,
    columns=["Sex","Pclass","Embarked"],
    drop_first=True,
    dtype=int,
)

In [None]:
# Preview the encoded feature table.
df_encoded.head()

## Missing value imputation

### Q: How many missing values are there? And how do we deal with them?

In [None]:
# Count the missing values in each column.
df_encoded.isnull().sum()

In [None]:
# Impute missing ages with the mean age. The result is assigned back to the
# column: fillna(inplace=True) called on `df_encoded.Age` acts on an
# intermediate object and does not update df_encoded under pandas
# Copy-on-Write, which would leave the NaNs in place.
df_encoded["Age"] = df_encoded["Age"].fillna(df_encoded["Age"].mean())

In [None]:
# Re-count the missing values in each column after the imputation.
df_encoded.isnull().sum()

Other ways to deal with missing values:
- prediction (e.g. linear regression)
- LVCF (last value carried forward): for time series

## Transformation

### Distribution Normalizing and Scaling

In [None]:
# Histograms of the encoded features on a 2x5 grid.
df_encoded.hist(layout=(2,5), figsize=(15,5));

### Q: Which continuous variables are not Normally (Gaussian) distributed? How do we make them more Normal?

In [None]:
# Histogram of the raw fares on a wide, short figure.
plt.figure(figsize=(10,3))
plt.hist(df_encoded.Fare);

In [None]:
# Histogram of the fares after a log(x + 1) transform.
plt.figure(figsize=(10,3))
plt.hist(np.log(df_encoded.Fare+1));

In [None]:
# Build a transformed copy (df_encoded itself is left untouched) in which
# Fare and SibSp are replaced by log(x + 1) of their values.
dfn = df_encoded.assign(
    Fare=np.log(df_encoded.Fare + 1),
    SibSp=np.log(df_encoded.SibSp + 1),
)

# Histograms of the transformed table, same 2x5 layout as before.
dfn.hist(layout=(2,5), figsize=(15,5));

### Q: How do we make all features the same scale (zero mean and one std)?

In [None]:
# Per-column means before scaling.
dfn.mean()

In [None]:
# Per-column standard deviations before scaling.
dfn.std()

In [None]:
# Standardize every column: subtract its mean, then divide by its standard
# deviation (credit: Chan Yu).
dfn1 = dfn.sub(dfn.mean()).div(dfn.std())

In [None]:
# Check: column means of the standardized table (expected to be ~0).
dfn1.mean()

In [None]:
# Check: column standard deviations of the standardized table (expected to be 1).
dfn1.std()

In [None]:
# Histograms of the standardized features on a 2x5 grid.
dfn1.hist(layout=(2,5), figsize=(15,5));

### Q: How do we scale train and test data?
    a. Scale each dataset independent of the other
    b. First scale all data then split to train and test
    c. Scale training data first, then scale test data the same way (using mean and std of training data)

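b and c both put the train and test data on the same scale. Usually c, because in b the test data influences the mean and std used for scaling (data leakage).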

### Q: How do we deal with outliers?

### A better way to Standardize the data (using scikit learn)

#### Scikit-learn is a very good Machine Learning library
http://scikit-learn.org/

In [None]:
from sklearn import preprocessing

# Learn each column's mean and standard deviation from dfn; `sc` keeps those
# statistics so that other data can later be scaled the same way.
sc = preprocessing.StandardScaler().fit(dfn)

# Replace dfn with its standardized values.
dfn = sc.transform(dfn)

Now we can scale the test data the same way. For example:

df_test = sc.transform(df_test)

### Principal Component Analysis (PCA)

In [None]:
from sklearn import decomposition

# PCA with default settings.
pca = decomposition.PCA()

# Fit on dfn and project it onto the principal components.
df_pca = pca.fit_transform(dfn)

# Show the projected data.
df_pca

### Q: Plot the first two principal components. Color 'survived' and 'died' differently.

In [None]:
# Number of feature columns.
len(dfn1.columns)

In [None]:
# Scatter of the first two principal components, coloured by the 'Survived' value.
plt.scatter(df_pca[:,0], df_pca[:,1], c=d.Survived)

### Q: How much variation is explained by each Principal Component?

In [None]:
# Fraction of the total variance explained by each principal component, in order.
plt.plot(pca.explained_variance_ratio_)

## Linear Discriminant Analysis (LDA)

In [None]:
from sklearn import discriminant_analysis

In [None]:
# LDA with default settings.
lda = discriminant_analysis.LinearDiscriminantAnalysis()

In [None]:
# Fit LDA with 'Survived' as the class label and project dfn onto the result.
df_lda = lda.fit_transform(dfn, d.Survived)

In [None]:
# Distribution of the LDA projection for each class.
# With two classes lda.fit_transform returns a 2-D array of shape
# (n_samples, 1), so each group is flattened to 1-D first: plt.hist rejects a
# list of 2-D arrays in current matplotlib.
plt.hist([df_lda[d.Survived==0].ravel(), df_lda[d.Survived==1].ravel()]);
plt.legend(['died','survived']);
plt.title("LDA");

In [None]:
# Distribution of the first principal component for each class.
plt.hist([df_pca[:,0][d.Survived==0],df_pca[:,0][d.Survived==1]]);
plt.legend(['died','survived']);
plt.title("PCA")

### Q: Which transformation better separates the classes?

LDA

### Q: Include 'title' from HW1 and create some visualizations.

### Q: Is there any way we can extract variables from the 'Cabin' and 'Ticket' columns and plot them?