## Pandas basics

#### Start out by importing the pandas library, and we'll tell Python that we're referring to it as "pd" (for the sake of brevity)

In [None]:
import pandas as pd

#### Read in a CSV file into a pandas dataframe we'll call "df"
#### The CSV reading function assumes that each person is on a new line, that the variable names are at the top of the file, and that all entries are separated by a comma

In [None]:
# Load the Titanic passenger data (path is relative to this notebook) into "df".
df = pd.read_csv(filepath_or_buffer='../dataset/titanic.csv')

#### You can view the variable/column names

In [None]:
# The column index converted to a plain Python list of names.
df.columns.tolist()

#### You can view how many data entries/cases there are

In [None]:
# One index entry per row, so the index length is the number of cases.
# print(...) with parentheses runs on both Python 2 and Python 3.
print(len(df.index))

#### You can access a specific column/variable using brackets and the column name in quotations

In [None]:
# Bracket access: works for any column name.
df['sex']

# or, equivalently, attribute access (only works when the column name is a valid Python identifier)

df.sex

#### You can also access multiple columns at once

In [None]:

# Passing a list of column names (note the double brackets) selects several columns at once.
df[["sex","fare"]]

#### You can view either the first N or last N set of rows

In [None]:
# Number of rows to preview from each end of the dataframe.
N = 5
print(df.head(N))  # first N rows
print(df.tail(N))  # last N rows

#### You can recode a variable by mapping old values to new values

In [None]:
# Cast the class codes to strings so they match the string keys of the mapping below.
df['pclass'] = df["pclass"].astype("str")
# replace() returns a new Series instead of modifying the column in place,
# so the result has to be assigned back for the recode to take effect.
df["pclass"] = df["pclass"].replace({'1': 'upper', '2': 'middle', '3': "lower"})
# Show the recoded column as the cell output.
df["pclass"]

In [None]:
# Label-based conditional assignment. `.ix` has been removed from pandas;
# `.loc` with a boolean mask and a column label is the supported equivalent.
# Rows whose age is missing satisfy neither condition and keep a missing age_recode.
df.loc[df.age >= 18, 'age_recode'] = "adult"
df.loc[df.age < 18, 'age_recode'] = "child"
print(df['age_recode'])

#### You can create new variables

In [None]:
# Element-wise sum of the two relative-count columns gives a new per-row variable.
df["familymembers"] = df["sibsp"] + df["parch"]
print(df[["sibsp", "parch", "familymembers"]])

#### You can code variables by their range

In [None]:
def _group_label(family_size):
    # Fewer than one accompanying family member counts as travelling alone.
    if family_size < 1:
        return 'alone'
    return 'group'


df["alone_or_group"] = df['familymembers'].apply(_group_label)

#### You can combine two different data sources that share a common attribute (e.g., "name")

In [None]:
# Two "sources" carved out of the same dataframe; both keep the shared "name" key.
survivors = df[["name", "survived"]]
ticketprices = df[["name", "fare"]]
# Join the two frames on the column they have in common.
merged = pd.merge(left=survivors, right=ticketprices, on="name")

## Examine Categorical Variables for Irregularities

#### Examine the different values entered

In [None]:
# Distinct values that appear in the column, in order of first appearance.
df["sex"].unique()

#### Make all string values lowercase

In [None]:
# The vectorised .str accessor lowercases every string and leaves missing
# values as missing, whereas calling x.lower() on a NaN entry raises AttributeError.
df['name'] = df['name'].str.lower()
print(df['name'])

#### Fix string values that might have an extra hidden space before or after

In [None]:
# Remove leading/trailing whitespace with the vectorised .str accessor;
# missing values stay missing instead of raising AttributeError.
df['name'] = df['name'].str.strip()
print(df['name'])

#### Fix string values that might be misspelled

In [None]:
import difflib


def _closest_sex(value):
    # Map a possibly misspelled entry onto "male"/"female".
    # Missing entries are returned unchanged (difflib cannot compare them).
    if pd.isnull(value):
        return value
    matches = difflib.get_close_matches(value, ["male", "female"], n=1)
    # get_close_matches returns an empty list when nothing is similar enough;
    # keep the original value in that case rather than raising IndexError.
    return matches[0] if matches else value


df['sex'] = df['sex'].apply(_closest_sex)
print(df['sex'])

## Examine quantitative variables for irregularities

#### View the central tendency, variability, minimum, maximum, and quartiles to check for impossible or unexpected values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the fare column.
df.fare.describe()

#### View a histogram that shows the distribution of the variables.

In [None]:
import matplotlib.pyplot as plt

# Open a fresh figure, draw a semi-transparent histogram of fares on it, then display it.
plt.figure()
df["fare"].plot(kind="hist", alpha=0.5)
plt.show()

#### Restrict the values that data can take (upper and lower limits)

In [None]:
# Force ages into the closed range [0, 100]: anything below 0 becomes 0 and
# anything above 100 becomes 100.
df.loc[:, 'age'] = df.loc[:, 'age'].clip(upper=100, lower=0)

## Handling Missing Data

#### Checking for missing data

In [None]:
# isnull() gives True for each missing age; summing the booleans counts them.
missing_count = df['age'].isnull().sum()
print(missing_count)

# Boolean mask selects only the rows whose age is missing.
missing_data = df[pd.isnull(df['age'])]
print(missing_data)

#### Drop rows that have a missing value for age

In [None]:
# Keep only rows with a known age. The explicit copy() makes the result an
# independent dataframe, so adding columns to it later does not raise
# SettingWithCopyWarning.
# NOTE: once this cell has run, "df" has no missing ages left, so the imputation
# cells that follow have nothing to fill unless the data is reloaded first.
df = df[pd.notnull(df['age'])].copy()
print(df['age'])

#### Replace missing values with the:
   1) median (middle value) for that column
   
   2) mode (most frequent value) for that column

In [None]:
# median() is a scalar, so every missing age receives the same value.
df['age_medianimpute'] = df['age'].fillna(df['age'].median())
# mode() returns a Series (there can be ties). Passing that Series to fillna()
# would align on index labels and fill at most the rows labelled 0, 1, ...;
# take the first mode as a scalar so every missing age is filled.
df['age_modeimpute'] = df['age'].fillna(df['age'].mode().iloc[0])

#### Replace missing values with the likely value

#### Use a linear regression formula to predict what the missing value would have been

In [None]:
from sklearn import linear_model, preprocessing
import numpy as np


# Encode the sex labels as integers so they can be used as a numeric predictor.
sex_encoder = preprocessing.LabelEncoder().fit(df["sex"])
df['sex_coded'] = sex_encoder.transform(df["sex"])

regr_features = ["fare", "sex_coded"]

# Train only on rows where the predictors and the target are present. Restricting
# dropna() to these columns avoids discarding rows because of gaps in unrelated columns.
df_complete = df.dropna(subset=regr_features + ['age'])
regr = linear_model.LinearRegression()
regr.fit(df_complete[regr_features], df_complete['age'])

# Rows to impute: age is missing but every predictor is available.
regr_needs_age = df['age'].isnull() & df[regr_features].notnull().all(axis=1)
if regr_needs_age.any():
    # Predict each missing age from that row's own predictors (one vectorised call),
    # instead of reusing the prediction for the first row of the dataframe.
    df.loc[regr_needs_age, 'age'] = regr.predict(df.loc[regr_needs_age, regr_features])

#### Use the 2 most similar cases to predict what the missing value would have been

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Encode the categorical predictors as integers for the distance computation.
sex_encoder = preprocessing.LabelEncoder().fit(df["sex"])
df['sex_coded'] = sex_encoder.transform(df["sex"])


pclass_encoder = preprocessing.LabelEncoder().fit(df["pclass"])
df['pclass_coded'] = pclass_encoder.transform(df["pclass"])

knn_features = ["fare", "sex_coded", "pclass_coded", "sibsp"]

# Train only on rows where the predictors and the target are present; gaps in
# unrelated columns should not remove training cases.
df_complete = df.dropna(subset=knn_features + ["age"])

neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(df_complete[knn_features], df_complete["age"])

# Rows to impute: age is missing but every predictor is available.
knn_needs_age = df['age'].isnull() & df[knn_features].notnull().all(axis=1)
if knn_needs_age.any():
    # Each missing age is predicted from that row's own 2 nearest neighbours,
    # instead of reusing the prediction for the first row of the dataframe.
    df.loc[knn_needs_age, 'age'] = neigh.predict(df.loc[knn_needs_age, knn_features])


## Apply the lesson

#### Import the pandas library (call it pd for short). 

#### Read in the dataset called "redwinequality.csv" in the dataset folder and save it to a dataframe called "df"

In [None]:
import pandas as pd
# Load the red wine quality data (path is relative to this notebook) into "df".
df = pd.read_csv(filepath_or_buffer='../dataset/redwinequality.csv')

#### What are the variables collected on each wine?

In [None]:
# Column names as a plain list; print(...) runs on both Python 2 and Python 3.
print(list(df.columns.values))

#### What is the maximum and minimum pH value?

In [None]:
# Largest and smallest pH value in the column.
print(df['pH'].max())
print(df['pH'].min())

# or: describe() reports min and max together with the other summary statistics

print(df['pH'].describe())

#### What does the distribution of quality ratings look like?

In [None]:
import matplotlib.pyplot as plt

# Open a fresh figure, draw a semi-transparent histogram of quality ratings, then display it.
plt.figure()
df["quality"].plot(kind="hist", alpha=0.5)
plt.show()

#### Some of the volatile acidity measurements are missing. How many of the measurements are missing? What proportion of wines are missing a volatile acidity rating?

In [None]:
# Count the missing volatile acidity measurements.
missing_count = df['volatileacidity'].isnull().sum()
print(missing_count)
# Proportion missing; float() forces true division on Python 2 as well.
print(missing_count / float(len(df.index)))

#### Fill in the missing values for volatile acidity with the median. Create a new column with the filled in missing data. Call that column: va_medianimpute

In [None]:
# Keep every observed volatile acidity value and substitute the column median
# wherever the measurement is missing.
df['va_medianimpute'] = df['volatileacidity'].where(
    df['volatileacidity'].notnull(),
    df['volatileacidity'].median(),
)

#### Fill in the missing values for volatile acidity using the 8 most similar cases/neighbors. Base similarity on the variables: fixedacidity, citricacid, residualsugar, and chlorides

#### Create a new column with the filled in missing data. Call that column: va_neighborimpute

In [None]:
from sklearn.neighbors import KNeighborsRegressor

va_features = ["fixedacidity", "citricacid", "residualsugar", "chlorides"]

# Train only on rows where the predictors and the target are present.
df_complete = df.dropna(subset=va_features + ["volatileacidity"])

neigh = KNeighborsRegressor(n_neighbors=8)
neigh.fit(df_complete[va_features], df_complete["volatileacidity"])

# Start from the observed values, then overwrite only the missing entries.
df['va_neighborimpute'] = df['volatileacidity']
va_needs_fill = df['volatileacidity'].isnull() & df[va_features].notnull().all(axis=1)
if va_needs_fill.any():
    # Each missing value is predicted from that row's own 8 nearest neighbours,
    # instead of reusing the prediction for the first row of the dataframe.
    df.loc[va_needs_fill, 'va_neighborimpute'] = neigh.predict(df.loc[va_needs_fill, va_features])

#### Load in the complete wine dataset (redwinequality_complete.csv), which has the true values for the missing values. Call the dataframe: df_true

#### Compare the absolute difference between the volatile acidity column of the complete dataset with va_medianimpute and the va_neighborimpute variables you created.

#### Which imputation method has the smallest MEAN/AVERAGE absolute difference?

In [None]:
df_true = pd.read_csv('../dataset/redwinequality_complete.csv')
# Mean absolute error of each imputed column against the true values.
# The subtraction aligns rows by index label - assumes df and df_true list the
# same wines in the same order; TODO confirm.
# The whole expression sits inside print(...): on Python 3, `print (x).abs()`
# would call .abs() on print's return value (None) and fail.
print((df_true["volatileacidity"] - df['va_medianimpute']).abs().mean())
print((df_true["volatileacidity"] - df['va_neighborimpute']).abs().mean())

#### You want to recode the chlorides as "high" if the chloride level is above .25 and as "low" if it is below or equal to .25. Create a new variable called "chlorides_category" that follows those rules. Show the new coded variable next to the old numeric variable.

In [None]:
# Cut-off stated in the exercise: above .25 is "high", at or below .25 is "low".
chlorides_cutoff = 0.25
# `.ix` has been removed from pandas; `.loc` with a boolean mask is the supported form.
# Rows with a missing chlorides value match neither condition and stay uncategorised.
df.loc[df.chlorides > chlorides_cutoff, 'chlorides_category'] = "high"
df.loc[df.chlorides <= chlorides_cutoff, 'chlorides_category'] = "low"
print(df[['chlorides', 'chlorides_category']])