In [None]:
import pandas as pd

#### Read in a CSV file into a pandas dataframe we'll call "df"
#### The csv reading function assumes that each person is on a new line, that the variable names are at the top of the file, and that all entries are separated by a comma

In [None]:
# Load the Titanic CSV (path is relative to the working directory) into "df".
df = pd.read_csv(filepath_or_buffer='../dataset/titanic.csv')

#### You can view the variable/column names

In [None]:
# Column labels of the frame as a plain Python list.
df.columns.tolist()

#### You can access a specific column/variable using brackets and the column name in quotations

In [None]:
# Bracket indexing with a single column label returns that column as a Series.
df['sex']

#### You can also access multiple columns at once

In [None]:

# Passing a list of labels (note the double brackets) selects several columns
# at once and returns a DataFrame.
df[["sex","fare"]]

## Examine Categorical Variables for Irregularities

#### Examine the different values entered

In [None]:
# Distinct values that appear in the "sex" column, in order of first appearance.
df["sex"].unique()

#### Make all string values lowercase

In [None]:
# Lowercase every name with the vectorised string accessor. Unlike
# ``apply(lambda x: x.lower())`` it leaves missing values (NaN) untouched
# instead of raising AttributeError on them.
df['name'] = df['name'].str.lower()
# print(...) with parentheses runs on both Python 2 and Python 3.
print(df['name'])

#### Fix string values that might have an extra hidden space before or after

In [None]:
# Remove leading/trailing whitespace with the vectorised string accessor.
# Unlike ``apply(lambda x: x.strip())`` it leaves missing values (NaN)
# untouched instead of raising AttributeError on them.
df['name'] = df['name'].str.strip()
# print(...) with parentheses runs on both Python 2 and Python 3.
print(df['name'])

#### Fix string values that might be misspelled

In [None]:
import difflib

# The only spellings we accept for the "sex" column.
VALID_SEX_LABELS = ["male", "female"]


def _closest_sex_label(value):
    """Return the valid label closest to ``value``.

    ``value`` is returned unchanged when it is missing (NaN/None) or when no
    valid label is similar enough, so a single unexpected entry can no longer
    abort the whole cell with an IndexError/TypeError.
    """
    if pd.isnull(value):
        return value
    # n=1 asks only for the best match, which is what ``[0]`` used to pick.
    matches = difflib.get_close_matches(value, VALID_SEX_LABELS, n=1)
    return matches[0] if matches else value


df['sex'] = df['sex'].apply(_closest_sex_label)
# print(...) with parentheses runs on both Python 2 and Python 3.
print(df['sex'])

## Examine quantitative variables for irregularities

#### View the central tendency, variability, minimum, maximum, and quartiles to check for impossible or unexpected values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the "fare" column.
fare_summary = df["fare"].describe()
fare_summary

#### View a histogram that shows the distribution of the variables.

In [None]:
import matplotlib.pyplot as plt

# Draw the "fare" histogram on an explicitly created Axes instead of relying
# on pyplot's implicit "current figure".
fig, ax = plt.subplots()
df["fare"].plot.hist(alpha=0.5, ax=ax)
plt.show()

#### Restrict the values that data can take (upper and lower limits)

In [None]:
# Clamp ages into [0, 100]: anything outside the range is replaced by the
# nearest bound, missing values stay missing.
age_lower, age_upper = 0, 100
df.loc[:, 'age'] = df['age'].clip(lower=age_lower, upper=age_upper)

## Handling Missing Data

#### Checking for missing data

In [None]:
# Keep only the rows whose age is missing (boolean-mask selection), so that
# len(missing) is the number of passengers without a recorded age.
missing = df[pd.isnull(df['age'])]
# print(...) with parentheses runs on both Python 2 and Python 3.
print(len(missing))

#### Drop rows that have a missing value for age

In [None]:
# Keep only the rows that have an age. The explicit .copy() makes the result
# an independent frame, so adding or overwriting columns on it later does not
# trigger SettingWithCopyWarning.
# NOTE: once this cell has run no missing ages remain, so the imputation
# cells further down only change anything if this cell is skipped.
df = df[pd.notnull(df['age'])].copy()
# print(...) with parentheses runs on both Python 2 and Python 3.
print(df['age'])

#### Replace missing values with the:
   1) median (middle value) for that column
   
   2) mode (most frequent value) for that column

In [None]:
# Option 1: fill missing ages with the median.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the in-place form is chained assignment and is not
# guaranteed to write through to df.
df['age'] = df['age'].fillna(df['age'].median())

# Option 2: fill missing ages with the mode (most frequent value).
# Series.mode() returns a Series (there can be ties), and fillna() with a
# Series aligns on index labels rather than filling with a single value, so
# take the first modal value as a scalar. If option 1 has already run there
# is nothing left to fill and this is a no-op.
age_mode = df['age'].mode()
if not age_mode.empty:  # mode() is empty when every age is missing
    df['age'] = df['age'].fillna(age_mode.iloc[0])

#### Replace missing values with the likely value

In [None]:
from sklearn import linear_model, preprocessing
import numpy as np


# Encode the categorical "sex" column as integers so it can be used as a
# regression feature.
sex_encoder = preprocessing.LabelEncoder().fit(df["sex"])
df['sex_coded'] = sex_encoder.transform(df["sex"])

# Train only on rows where the predictors and the target are all present.
# A bare df.dropna() would also discard rows with gaps in unrelated columns
# and can leave very little (or no) training data.
age_predictors = ["fare", "sex_coded"]
df_complete = df.dropna(subset=age_predictors + ["age"])
regr = linear_model.LinearRegression()
regr.fit(df_complete[age_predictors], df_complete['age'])

# Impute each missing age from that row's own predictors. The previous
# row-wise apply() predicted the whole frame for every row and took element
# [0], so every missing age received the prediction for the first row of df.
# Rows that also lack a predictor are left missing because the model cannot
# score them.
age_missing = df['age'].isnull() & df[age_predictors].notnull().all(axis=1)
if age_missing.any():  # predict() rejects an empty feature matrix
    df.loc[age_missing, 'age'] = regr.predict(df.loc[age_missing, age_predictors])

In [None]:
from sklearn import preprocessing  # imported here so the cell does not depend on an earlier cell
from sklearn.neighbors import KNeighborsRegressor

# Encode the categorical columns as integers so they can be used as features.
sex_encoder = preprocessing.LabelEncoder().fit(df["sex"])
df['sex_coded'] = sex_encoder.transform(df["sex"])


pclass_encoder = preprocessing.LabelEncoder().fit(df["pclass"])
df['pclass_coded'] = pclass_encoder.transform(df["pclass"])

# Train only on rows where the predictors and the target are all present.
# A bare df.dropna() would also discard rows with gaps in unrelated columns
# and can leave very little (or no) training data.
knn_predictors = ["fare", "sex_coded", "pclass_coded", "sibsp"]
df_complete = df.dropna(subset=knn_predictors + ["age"])

neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(df_complete[knn_predictors], df_complete["age"])

# Impute each missing age from that row's own predictors. The previous
# row-wise apply() predicted the whole frame for every row and took element
# [0], so every missing age received the prediction for the first row of df.
# Rows that also lack a predictor are left missing because the model cannot
# score them.
age_missing = df['age'].isnull() & df[knn_predictors].notnull().all(axis=1)
if age_missing.any():  # predict() rejects an empty feature matrix
    df.loc[age_missing, 'age'] = neigh.predict(df.loc[age_missing, knn_predictors])
