In [4]:
# Raise pandas' display limits so wide/long frames are not truncated when rendered.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50

In [None]:
#check time to run in jupyter notebook
# NOTE: `%time` is a line magic: it times the statement written on the same line
# (e.g. `%time some_call()`). To time a whole cell, put `%%time` on the cell's first line.
%time

In [None]:
#instantiate matplotlib plot
f, ax = plt.subplots(figsize=(6, 15))

#create a seaborn horizontal bar plot, drawn explicitly on the axes created above
sns.barplot(x=feature_importance.feature_importance, y=feature_importance.features, orient="h", ax=ax)

#Plotting multiple plots in one cell (but not on same graph)
series = week_sale["total"]
acf_fig = pyplot.figure()
#the three digits stand for 2: # of rows, 1: # of cols, 1: the index of the plot (1st row)
acf_ax = acf_fig.add_subplot(211)
#pass the axes object directly instead of relying on whichever axes is "current"
plot_acf(series, ax=acf_ax)


In [None]:
#count number of NAs in each column
df.isna().sum()

#check columns that have NAs
df.columns[df.isna().any()]

#check fraction of NAs per column (multiply by 100 for a percentage)
#the mean of a boolean mask is the share of True values, so no per-column lambda is needed
df.isna().mean()

In [None]:
# Cast the flag columns to bool and the place-name columns to category;
# columns that are not listed below are not touched.
boolean = ["Lpresent", "HDpresent"]
categorical = ["areaname", "county", "state"]

hd_lowes[boolean] = hd_lowes[boolean].astype("bool")
hd_lowes[categorical] = hd_lowes[categorical].astype("category")

In [None]:
#Imputing missingness by grouping
#The following df has 3 variables: PClass, SexCode, Age

#per-(PClass, SexCode) mean of the remaining columns, as nested dicts keyed by column name
imputation_dict = df.groupby(["PClass", "SexCode"]).mean().to_dict()
imputation_dict['Age']

#impute NAs with most recent last value
#(ffill() is the supported spelling; fillna(method="ffill") is deprecated in recent pandas)
df.ffill()

In [None]:
#Create mapping table to fill NAs based on another column
#"group_col" is the column to group by; "col" is the column whose NAs are filled.
#Selecting the single column before aggregating yields a flat {group value: mean} dict,
#which is the shape Series.map expects.
mapping_table = df.groupby("group_col")["col"].agg(lambda x: int(x.mean())).to_dict()

#Fill NAs based on mapping table above (example from Simulmedia codingchallenge)
#the lookup goes through the grouping column, so each missing value gets its own group's mean
df["col"].fillna(df["group_col"].map(mapping_table))

In [None]:
#Create an ordered set of distinct values
from collections import OrderedDict


def ordered_unique_chars(text):
    """Return the distinct characters of ``text`` in first-seen order, joined into one string.

    Args:
        text: any iterable of single-character strings (typically a ``str``).

    Returns:
        A ``str`` containing each distinct character once, in order of first appearance.
    """
    # dict keys are unique and OrderedDict remembers insertion order
    return "".join(OrderedDict.fromkeys(text))


ordered_unique_chars("mississippi")  # -> 'misp'

In [None]:
# Weekly grouper whose bins end on Friday ("W-FRI").
# The data's index has to be converted to a DatetimeIndex before grouping with it.
grouper = pd.Grouper(freq="W-FRI")

In [None]:
#Change column to datetime
df["date"] = pd.to_datetime(df["date"])

#Change to datetime and get year/month
#(on a Series the datetime fields live behind the .dt accessor; .year/.month directly on a
# Series raise AttributeError)
pd.to_datetime(df['column']).dt.year
pd.to_datetime(df['column']).dt.month

#if column is already datetime, can use the following:
df["column"].dt.to_period("M") # this will give YYYY-MM format

In [None]:
# Pivot table of average monthly temperature per city: rows are month bins taken
# from the "date" column, columns are cities, cells are the mean temperature.
monthly_bins = pd.Grouper(key='date', freq='M')
df.pivot_table(values='temperature', index=monthly_bins, columns='city', aggfunc='mean')

In [None]:
#default format='%m/%d/%y %H:%M'
def str_to_datetime(df, features, fmt='%m/%d/%y %H:%M'):
    """Parse string timestamp columns of ``df`` into datetimes, in place.

    Args:
        df: DataFrame whose listed columns are overwritten.
        features: iterable of column names to convert.
        fmt: strptime-style format of the stored strings; the default is
            month/day/two-digit-year followed by hour:minute.

    Returns:
        None. Each listed column of ``df`` is replaced by its parsed values.

    Raises:
        ValueError: propagated from ``pd.to_datetime`` when a value does not match ``fmt``.
    """
    for feature in features:
        df[feature] = pd.to_datetime(df[feature], format=fmt)
    
#default format='%m/%d/%y'
def str_to_date(df, features, fmt='%m/%d/%y'):
    """Parse string date columns of ``df`` into datetimes, in place.

    Args:
        df: DataFrame whose listed columns are overwritten.
        features: iterable of column names to convert.
        fmt: strptime-style format of the stored strings; the default is
            month/day/two-digit-year.

    Returns:
        None. Each listed column of ``df`` is replaced by its parsed values.

    Raises:
        ValueError: propagated from ``pd.to_datetime`` when a value does not match ``fmt``.
    """
    for feature in features:
        df[feature] = pd.to_datetime(df[feature], format=fmt)

#default format='%m/%d/%Y'
def str_to_date1(df, features, fmt='%m/%d/%Y'):
    """Parse string date columns of ``df`` into datetimes, in place.

    Args:
        df: DataFrame whose listed columns are overwritten.
        features: iterable of column names to convert.
        fmt: strptime-style format of the stored strings; the default is
            month/day/four-digit-year.

    Returns:
        None. Each listed column of ``df`` is replaced by its parsed values.

    Raises:
        ValueError: propagated from ``pd.to_datetime`` when a value does not match ``fmt``.
    """
    for feature in features:
        df[feature] = pd.to_datetime(df[feature], format=fmt)

#str to numeric
def num(df, features, pattern='[$,() ]'):
    """Strip formatting characters from string columns of ``df`` and cast them to float, in place.

    Args:
        df: DataFrame whose listed columns are overwritten.
        features: iterable of column names to convert.
        pattern: regular expression whose matches are removed before the cast; the
            default removes dollar signs, commas, parentheses and spaces.

    Returns:
        None. Each listed column of ``df`` is replaced by its float values.

    Raises:
        ValueError: propagated from ``astype(float)`` when a cleaned value is not numeric.
    """
    for feature in features:
        # note: parentheses are simply removed, so "(30)" becomes 30.0, not -30.0
        df[feature] = df[feature].replace(pattern, '', regex=True).astype(float)

In [None]:
#Visualize distribution of binned age
binned_hh_age_count = df["binned_head_of_household_age"].value_counts()
binned_second_hh_age_count = df["binned_second_head_of_household_age"].value_counts()

#Two stacked panels, one per household head; the figure size is set once on the figure
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(10, 9))
fig.subplots_adjust(hspace=0.5)

#Plot head of household age distrib (.loc[labels] orders the bars as listed in `labels`)
binned_hh_age_count.loc[labels].plot(kind="bar",
                         ax=ax0,
                         title="Distribution of head of household age")
ax0.set_xlabel("Age")
ax0.set_ylabel("Frequency")

#Plot second head of household age distrib on the lower panel so it is not left empty
# NOTE(review): assumes the second column uses the same bin labels as the first -- confirm
binned_second_hh_age_count.loc[labels].plot(kind="bar",
                         ax=ax1,
                         title="Distribution of second head of household age")
ax1.set_xlabel("Age")
ax1.set_ylabel("Frequency")

### Modeling stuff

In [None]:
#changing column from categorical to 1's and 0's
#vectorised comparison instead of a per-row lambda: "Yes" -> 1, anything else (including NaN) -> 0
(df["column"] == "Yes").astype(int)

In [None]:
#Stratified train/test split (splitting 80/20 split) - for categorical variable
from sklearn.model_selection import train_test_split

#here the stratify=y is referring to y, the target variable, and not "yes"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


#Stratified for continuous variable, you will first have to create col with bins to make it 'categorical'
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
#keep original data if <5, bucket anything above as 5
#(the result is assigned back: an inplace where() on a column selection can end up
# modifying a temporary object instead of the frame)
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

#columns= already names the axis, so no axis argument is needed
X_train, X_test, y_train, y_test = train_test_split(housing.drop(columns=["income_cat", "median_house_value"]),
                                                    housing["median_house_value"],
                                                    test_size=0.2, random_state=42,
                                                    stratify=housing["income_cat"])

In [None]:
#Dummify categorical variables
#placeholder names: replace with the categorical columns to encode
use_columns = ["categorical_col_1", "categorical_col_2"]

#drop_first drops one level per column; dummy_na adds an indicator column for missing values
pd.get_dummies(df[use_columns], drop_first=True, dummy_na=True)

In [None]:
#Gridsearch with RandomForest
grid_para_forest = [{
    "n_estimators": range(100,201,50),
    "n_jobs": [-1],
    #"auto" is not accepted by current scikit-learn releases (for forest classifiers it was
    #only another name for "sqrt"), so the grid compares "sqrt" with "log2" instead
    # NOTE(review): assumes `randomForest` is a classifier, as scoring='roc_auc' suggests -- confirm
    "max_features": ["sqrt","log2"],
    "min_samples_leaf": [1,2,3],
    "random_state": [42]
    }]

#n_jobs = -1 uses all processors in your computer to do parallel run 
grid_search_forest = GridSearchCV(randomForest, grid_para_forest, scoring='roc_auc', cv=5, n_jobs=-1)

In [None]:
# Feature importances of the best estimator found by grid_xgb, labelled with
# the column names of X and drawn as horizontal bars in ascending order.
best_importances = grid_xgb.best_estimator_.feature_importances_
xgb_FI = pd.Series(best_importances, index=X.columns)

xgb_FI.sort_values().plot.barh()

In [None]:
# TfidfVectorizer is a module that assigns weights to texts 
# common words will have low weights and special words have high weights

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# vectorise the raw text, then feed the weighted term matrix to a multinomial naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train_x, train_y)

In [None]:
#Confusion matrix and heatmap
from sklearn.metrics import confusion_matrix

#one tick per class; the same ordered list fixes the row/column order of the matrix
class_labels = sorted(set(test_y) | set(predicted_y))
mat = confusion_matrix(test_y, predicted_y, labels=class_labels)
#mat.T puts the true labels on the x axis and the predicted labels on the y axis
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
           xticklabels=class_labels,
           yticklabels=class_labels)
plt.xlabel('True Labels')
plt.ylabel("Predicted Labels")

In [None]:
#Kmeans elbow plot
from sklearn.cluster import KMeans

#candidate numbers of clusters; both names refer to the same range
k = Ks = range(1, 10)
km = [KMeans(n_clusters=i) for i in Ks]
#fit each model on the data and record its score on that same data
score = [model.fit(my_matrix).score(my_matrix) for model in km]

### Pipelines

In [None]:
#packages

import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

#Dataframe Selector for preprocessing in a pipeline
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input."""

    def __init__(self, attribute_names):
        # stored under the parameter's own name
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Index ``X`` with the stored column labels and return the result."""
        wanted = self.attribute_names
        return X[wanted]

# Column labels of `train` whose dtype is anything other than object.
is_not_object = train.dtypes != "object"
numeric_feats = train.dtypes[is_not_object].index

# Select those columns, fill missing values with the column median, then standardise.
numeric_steps = [
    ('selector', DataFrameSelector(numeric_feats)),
    ('imputer', SimpleImputer(strategy="median")),
    ('scalar', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

In [None]:
#Column Selector

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that returns the configured columns of a DataFrame.

    A missing column is reported through a KeyError that lists the absent names.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``; ``X`` must be a pandas DataFrame."""
        assert isinstance(X, pd.DataFrame)
        try:
            selected = X[self.columns]
        except KeyError:
            # name the requested columns that the frame does not contain
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)
        else:
            return selected

# Build the feature list by exclusion: every column of df except the ones named here.
excluded_cols = ["target", "phone number"]
x_cols = [col for col in df if col not in excluded_cols]

In [None]:
#build a TypeSelector for the pipeline
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps the DataFrame columns matching a single dtype."""

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by dtype; ``X`` must be a pandas DataFrame."""
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)

In [None]:
#build a StringIndexer to allow onehotencoder to work properly
class StringIndexer(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces every categorical column by its integer category codes."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; the indexer itself is returned."""
        return self

    def transform(self, X):
        """Return a frame of category codes; ``X`` must be a DataFrame of categorical columns."""
        assert isinstance(X, pd.DataFrame)

        def encode(column):
            # code -1 is remapped to the first integer after the real category codes
            extra_code = len(column.cat.categories)
            return column.cat.codes.replace({-1: extra_code})

        return X.apply(encode)

In [None]:
#build out transformer
## IMPORTANT: only include the dtypes that the df actually has, 
## ie if X_train doesnt have bool dtype cols, it will return an error!
# NOTE(review): `Imputer` is not available in newer scikit-learn; `SimpleImputer` is its
# replacement but may not accept bool-dtype input -- confirm before migrating this cell.

preprocess_pipeline = Pipeline([
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        ('boolean', Pipeline([
            ('selector', TypeSelector('bool')),
            ('imputer', Imputer(strategy="most_frequent")),
        ])),  # booleans close
        
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
            ('imputer', Imputer(strategy="median")),
            ('scaler', StandardScaler()),
        ])),  # numericals close
        
        ('categoricals', Pipeline([
            ('selector', TypeSelector('category')),
            # StringIndexer asserts a DataFrame and reads each column's .cat accessor, so it
            # has to run directly on the selector's output, before any step that returns an array
            ('labeler', StringIndexer()),
            # runs on the integer codes produced by the labeler
            ('imputer', Imputer(strategy="most_frequent")),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ]))  # categoricals close
    ])),  # features close
])  # pipeline close

In [None]:
# Full pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

#Reading the training and testing data
#the training frame is dummified right after loading to handle categorical data;
#the test frame is only read here
train = pd.read_csv('../data/train.csv').pipe(pd.get_dummies, drop_first=True)
test = pd.read_csv('../data/test.csv')

#Dataframe Selector for preprocessing in a pipeline
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input."""

    def __init__(self, attribute_names):
        # stored under the parameter's own name
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Index ``X`` with the stored column labels and return the result."""
        wanted = self.attribute_names
        return X[wanted]
    
#Separating target and predictor variables
X = train.drop(["SalePrice"], axis=1)
y = train["SalePrice"]

#Create pipeline for numerical data
#the feature list is taken from X (target already dropped), so the selector never asks
#for the "SalePrice" column, which X does not contain
num_features = X.dtypes[X.dtypes != "object"].index

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_features)),
    ('imputer', SimpleImputer(strategy="median")),
    ('scalar', StandardScaler()),
])

#Log transformation of Saleprice
y_log = np.log1p(y)

#fixed seed so the split is the same on every run
X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.25, random_state=42)

#fit the imputer/scaler on the training split only, then apply those same fitted
#statistics to the validation split (re-fitting on validation data would scale it differently)
X_train = num_pipeline.fit_transform(X_train)
X_val = num_pipeline.transform(X_val)

### Modeling

In [None]:
#Compare models
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score

#Compare three regressors: cross-validated RMSE inside the training split,
#then RMSE on the held-out validation split

def rmse_from_neg_mse(neg_mse_scores):
    """Convert negated-MSE fold scores into the mean of the per-fold RMSE values."""
    return np.sqrt(-neg_mse_scores).mean()


lasso = Lasso()
rf = RandomForestRegressor()
xgb = XGBRegressor(n_jobs=-1)
models = [("Lasso", lasso), ("Random Forest", rf), ("XGBoost", xgb)]

#Cross-validated RMSE (5 folds of the training split):
cv_scores = {}
for name, model in models:
    cv_scores[name] = cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
    print("{} CV RMSE: {}".format(name, rmse_from_neg_mse(cv_scores[name])))

#per-model score arrays under their individual names
las_scores, rf_scores, xgb_scores = (cv_scores[name] for name, _ in models)

#Validation RMSE (fit on the whole training split, score on X_val / y_val):
for name, model in models:
    model.fit(X_train, y_train)
    val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    print("{} Validation RMSE: {}".format(name, val_rmse))

In [None]:
#Classifier Switcher
from sklearn.base import BaseEstimator

class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that delegates to a swappable wrapped classifier.

    Every method forwards to ``self.estimator``, so replacing that attribute
    switches the classifier that is used.
    """

    def __init__(
        self, 
        estimator = None,
    ):
        """
        :param estimator: sklearn object - The classifier. When omitted, a fresh
            ``SGDClassifier()`` is created for this instance, so separate switchers
            never share one default estimator object.
        """ 
        if estimator is None:
            # imported here so the class definition does not depend on a module-level import
            from sklearn.linear_model import SGDClassifier
            estimator = SGDClassifier()

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator, forwarding extra keyword arguments, and return self."""
        self.estimator.fit(X, y, **kwargs)
        return self


    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is ignored)."""
        return self.estimator.predict(X)


    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        """Return the wrapped estimator's ``score`` for ``X`` and ``y``."""
        return self.estimator.score(X, y)