# Data Cleaning Notebook
## Things that you'll want to do to pandas data frames a lot

* Natalia Zhang wenshuo.zhang@gmail.com
* Control-F to find what you need

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Frequency counts with NAs

In [None]:
# Tally every distinct value of the column; dropna=False gives missing values their own row.
df["column"].value_counts(dropna=False)

## Cross-tabs and Pivot Tables    

In [None]:
# pd.crosstab needs the two variables to tabulate: rows first, then columns.
# margins=True    -> adds the "All" totals
# dropna=False    -> keeps columns whose entries are all NaN (it does NOT drop them)
# normalize='index' -> each row is shown as relative frequencies instead of counts
pd.crosstab(df.row_variable, df.column_variable, margins=True, dropna=False, normalize='index')

In [None]:
# `values` is the single column being summarised; `index` may list many grouping columns,
# though the table gets hard to read once there are more than a few.
# Pass the aggregation by name ("mean") rather than as np.mean: recent pandas warns when
# given the numpy callable and recommends the string to keep the current behaviour.
df.pivot_table(values="seniority", index=["company_id", "current"], aggfunc="mean")

#aggfunc also accepts callables, e.g. basically any numpy reduction
#probably anything here: https://docs.scipy.org/doc/numpy-1.15.1/reference/routines.statistics.html

## Create Dummy Variables in PD
In other words, use `pd.get_dummies` rather than scikit-learn's `OneHotEncoder` (OHE) here; the extra setup is just not worth it.

In [None]:
# One indicator column per level of the categorical variable.
dummydf = pd.get_dummies(df["categorical_variable"])
# Glue the indicators onto the original frame column-wise (the pandas analogue of R's cbind).
newdf = pd.concat((df, dummydf), axis="columns")

## Clean Ill-formatted dates into datetime format

In [None]:
#are dates in strings with different formats?
#Need to return a datetime object?
#better hope they're mal-formed in some consistent manner

import datetime

def formatdt(dt):
    """Convert a date written as 'YYYY-MM-DD' or 'MM.DD.YYYY' into a ``datetime.date``.

    Parameters
    ----------
    dt : object
        The raw value. It is converted with ``str()`` and stripped of surrounding
        whitespace before parsing, so non-string values whose text form uses one
        of the two layouts (for example a ``datetime.date``) are accepted as well.

    Returns
    -------
    datetime.date or float
        The parsed date, or ``np.nan`` when ``dt`` is ``None`` or its text
        contains neither ``-`` nor ``.``.

    Raises
    ------
    ValueError
        If the text contains one of the separators but does not match the
        layout selected by that separator.
    """
    if dt is None:
        # np.nan (lowercase) is the spelling that exists in every NumPy release.
        return np.nan
    text = str(dt).strip()
    # The separator decides the layout; "-" is checked first, and we are blithely
    # assuming that any dashed value is YYYY-MM-DD and any dotted one is MM.DD.YYYY.
    if "-" in text:
        layout = '%Y-%m-%d'
    elif "." in text:
        layout = '%m.%d.%Y'
    else:
        # Unrecognised input is reported with the same missing marker as None.
        return np.nan
    # Parse the text form (not the raw object) so the check above and the parse agree.
    return datetime.datetime.strptime(text, layout).date()

## Basic Visualizations

This is adapted from Lauren's code.

In [None]:
#Basic plotting
# A single theme call is enough: each sns.set(style=...) replaces the previous style.
sns.set(style='whitegrid', color_codes=True)

# Numeric features: density-scaled histogram with a KDE curve on top.
# histplot (seaborn >= 0.11) is the documented replacement for the deprecated distplot.
for numeric_col in ['feature1', 'feature2']:
    sns.histplot(df[numeric_col], kde=True, stat='density', kde_kws=dict(cut=3))
    plt.show()

# Categorical features: one horizontal bar per level, bar length = row count.
for categorical_col in ['feature3', 'feature4']:
    sns.countplot(y=categorical_col, data=df)
    plt.show()

## Heatmap of Correlations

In [None]:
# Pairwise correlations between the columns, drawn as a colour grid without cell annotations.
correlations = ec2features.corr()
sns.heatmap(correlations, annot=False)

## Histogram

In [None]:
# hist is a DataFrame method (there is no pd.hist), and the keyword is `column`.
# X is the column to plot, Y the column to split the panels by.
df.hist(column=X, by=Y, bins=10)

#full options:
#df.hist(column=None, by=None, grid=True, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, sharey=False, figsize=None, layout=None, bins=10)

## Overlapping Histogram Function
Scott's

In [None]:
# Overlay the distribution of `col` for two neighbouring values of the grouping variable
# on a single Axes. Rows are picked with a boolean mask inside .loc[rows, column];
# `row_filter` stands for any extra boolean Series to combine with `&`.
fig, ax = plt.subplots()
for group_value in (value, value + 1):
    in_group = row_filter & (df['grouping_var'] == group_value)
    # histplot (seaborn >= 0.11) is the documented replacement for the deprecated distplot.
    sns.histplot(df.loc[in_group, 'col'], kde=True, stat='density',
                 kde_kws=dict(cut=3), label=group_value, ax=ax)
ax.legend()  # without a legend the two overlaid groups cannot be told apart

Alternatively, in seaborn (I think this is Inh's):

In [None]:
# Overlay the target distribution for two values of a column, one labelled layer each.
# histplot (seaborn >= 0.11) is the documented replacement for the deprecated distplot.
fig, ax = plt.subplots()
for level, legend_text in ((x, "Label1"), (y, "Label2")):
    subset = df.loc[df['col_name'] == level, 'target_var']
    sns.histplot(subset, kde=True, stat='density', kde_kws=dict(cut=3),
                 label=legend_text, ax=ax)
ax.legend()
plt.show()

## Boxplot

Inline Pandas method

In [None]:
# boxplot is a DataFrame method (there is no pd.boxplot).
# X is the column to summarise, Y the column to group the boxes by.
df.boxplot(column=X, by=Y)

#full options:
#df.boxplot(column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds)

Seaborn method, comparative

In [None]:
# Compare the target's spread for rows where grouping_var is 0 (True) against all others (False).
fig, ax = plt.subplots()
is_reference_group = df['grouping_var'] == 0
ax = sns.boxplot(x=is_reference_group, y=df['target_var'], ax=ax)

## Faceted graphs (aka multiple graphs) in one go

In [None]:
#This one is mine

# One small bar chart per company (wrapped six to a row): "current" by "dept".
# x and y are passed by keyword because newer seaborn releases take `data` as the
# first positional argument and make every other parameter keyword-only.
g = sns.catplot(x="dept", y="current", col="company_id", data=ec,
                kind="bar", height=2.5, aspect=.8, col_wrap=6)
#catplot is generic
#we can specify (strip, swarm, box, violin, boxen, point, bar, count)
#https://seaborn.pydata.org/generated/seaborn.catplot.html

# Tilt the department names so they do not overlap in the narrow panels.
g.set_xticklabels(rotation=30, ha='right')

In [None]:
# Seniority
# whose?

# One panel per department, overlaying seniority for quitters and non-quitters.
# The number of panels follows the data instead of being fixed, and the departments
# are sorted so the panel order is the same on every run.
depts = sorted(df['dept'].dropna().unique())
panel_height = 20 / 6  # inches per panel
fig, axs = plt.subplots(len(depts), 1, figsize=(6, panel_height * len(depts)),
                        squeeze=False)  # squeeze=False -> always a 2-D array of Axes

for ax, dept in zip(axs[:, 0], depts):
    dept_rows = df[df['dept'] == dept]
    for flag, legend_text in ((1, "quit"), (0, "No quit")):
        # histplot (seaborn >= 0.11) is the documented replacement for the deprecated distplot.
        sns.histplot(dept_rows.loc[dept_rows['quitters'] == flag, 'seniority'],
                     kde=True, stat='density', kde_kws=dict(cut=3),
                     label=legend_text, ax=ax)
    ax.set_xlabel(dept)  # the department name identifies the panel
    ax.legend()          # per-panel legend; plt.legend() would only reach the last panel

plt.tight_layout()
plt.show()

## Visualize multiple categories at once!

In [None]:
#Whitney's, possibly through Michelle

# A grid of count plots, one per categorical feature, two panels per row.
features = ['f1', 'f2', 'f3', 'f4', 'f5']
n_cols = 2
n_rows = -(-len(features) // n_cols)  # ceiling division

# Create the whole grid up front: plt.subplots returns (figure, axes), and building
# the panels here avoids leaving an empty full-figure Axes behind the grid.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(10, 15), squeeze=False)
fig.subplots_adjust(hspace=1.0)  # room for the rotated tick labels; set once, not per panel

for ax, feature in zip(axs.flat, features):
    sns.countplot(x=feature, data=df, ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title("No. of employee")

# Hide any panel left over when the feature count does not fill the grid.
for unused_ax in axs.flat[len(features):]:
    unused_ax.set_visible(False)

## Extracting Feature Importance from Sklearn

In [None]:
#Check feature importance
# Each coefficient is multiplied by the standard deviation of its (scaled) input column.
features = Xvars
importance = np.std(X_scaled, 0) * classifier.coef_
importance_CV = np.std(X_scaled, 0) * clf_CV.coef_

# Same report for both models: a heading, then one "name : score" line per feature,
# with a blank line separating the two reports.
reports = (
    ('Feature importance, logistic regression:', importance),
    ('Feature importance, logistic regression with cross-validation:', importance_CV),
)
for position, (heading, scores) in enumerate(reports):
    if position:
        print('')
    print(heading)
    for idx, name in enumerate(features):
        print(name, ':', str(scores[0][idx]))

## Confusion Matrix Visualization

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

labels = ['label1', 'label2']  # NOTE(review): assumed to be in the same order as the matrix's classes - confirm

matrix = confusion_matrix(y_test, test_predict)

# The matrix is transposed before plotting, so the heatmap's x axis runs over the
# actual classes and its y axis over the predicted ones, matching the axis labels below.
heatmap_options = dict(
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=labels,
    yticklabels=labels,
)
sns.heatmap(matrix.T, **heatmap_options)
plt.xlabel("Actual Label")
plt.ylabel("Predicted Label")