# Exploratory Data Analysis
This notebook contains boilerplate code for typical EDA steps:
- Import
- Summary Stats
- Cleaning
- Missing data
- Exploratory plots

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing data

In [None]:
# Load the CSV, using its first column as the index instead of as data
a = pd.read_csv("a.csv", index_col=0)

# Assign column names to the dataframe
colnames = ['column1', 'column2', 'etc']
a.columns = colnames

In [6]:
# Left join, SQL style: every row of `a` is kept, and the 'key' column of `a`
# is matched against the index of `b`.
df = a.join(b, on='key', how='left')

## Summarizing & cleaning data

In [None]:
# Overview of the dataframe: dtype and non-null count per column, plus memory usage
df.info()

In [None]:
# It's often helpful to have a count of unique values in each column:
def nunicol(df):
    """Return a one-row DataFrame with the number of unique values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the same column labels as ``df``; each
        cell holds that column's ``nunique()`` count (NaN is not counted).
    """
    # DataFrame.nunique() already counts per column; reshape the resulting
    # Series into one row so the counts line up under the column headers.
    return df.nunique().to_frame().T

# Display the per-column unique-value counts for df
nunicol(df)

In [None]:
## Make crosstab table for initial overview. Also exposes misspelled feature levels.
# (A stray empty argument in the original call was a SyntaxError and has been removed.)
ct = pd.crosstab(
    [df.feature_1, df.feature_2, df.feature_3],
    df.target,
    normalize='index',
)
# Sort by the share of the target level labelled 1
# (assumes the target has a level 1, e.g. a binary 0/1 target -- adjust `by` otherwise)
ct.sort_values(by=1, ascending=False)

# normalize='index' gives fractions per row (each row sums to 1)
# normalize='all' gives fractions of the overall total
# to access a column, use e.g.: ct.iloc[:,-2]

In [None]:
# Standardize spellings/typos with a {column: {wrong: correct}} mapping
spelling_fixes = {
    'column_name': {'wrong_1': 'correct_1', 'wrong_2': 'correct_2'},
}
df.replace(spelling_fixes, inplace=True)

# Show the remaining levels to verify the replacements
df['column_name'].unique()

### Dealing with missing data

In [None]:
# Number of missing values (NaN/None) in each column
df.isna().sum()

In [None]:
# Drop every row that contains at least one NaN (modifies df in place).
# Use axis='columns' instead to drop the columns that contain NaNs.
df.dropna(axis='index', inplace=True)

In [None]:
# Mean imputation
# Assign the result back instead of calling fillna(inplace=True) on the
# `df.column` accessor: that is chained assignment, which warns in pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is active (default in pandas 3.0).
df['column'] = df['column'].fillna(df['column'].mean())

In [None]:
# Imputation using sklearn
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # or 'median', 'most_frequent', 'constant'
# Filling the dataframe (by default fit_transform returns a NumPy array, not a DataFrame)
df_imped = imp.fit_transform(df)

# When dealing with separate train/test sets, fit on the training data only,
# then apply the same fitted statistics to BOTH sets. Fitting without
# transforming X_train would leave its NaNs in place.
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

## Plotting raw data

In [None]:
# Univariate distribution plot (histogram with optional KDE overlay).
# `sns.distplot` is deprecated since seaborn 0.11; `sns.histplot` is its
# replacement for histograms (set kde=True to overlay a density estimate).
sns.histplot(df.column, kde=False)
# For the rug plot that distplot offered via rug=True, add: sns.rugplot(df.column)

In [None]:
# Helper for drawing a distribution as a line with a shaded area beneath it
def plot_with_fill(x, y, label):
    """Plot ``y`` against ``x`` as a labelled line and shade down to zero.

    The region between the curve and y=0 is filled with the line's own
    colour at alpha 0.2, and the legend is redrawn on every call.
    """
    curve = plt.plot(x, y, lw=2, label=label)[0]
    plt.fill_between(x, 0, y, color=curve.get_color(), alpha=0.2)
    plt.legend(loc='best')

In [None]:
# Correlation matrix of the feature columns
feature_names = list(df.columns[1:10])  # columns at positions 1 through 9
label_name = list(df.columns[10:])  # every column from position 10 onwards

features = df[feature_names]

plt.figure(figsize=(10, 10))
corr_matrix = features.corr()
# Annotated square heatmap of the pairwise correlations
sns.heatmap(corr_matrix, cmap='coolwarm', square=True, annot=True)
plt.show()