# 30 Day Kaggle Challenge
## Competition Phase


## Initialize Data

In [94]:
#Import the important packages
import pandas as pd
import os, datetime

#Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

#Validation Fix
from sklearn.model_selection import train_test_split

#Model Quality Fix
from sklearn.metrics import mean_squared_error
# for root mean squared error rmse
#rms = mean_squared_error(y, X, squared= False)

#Bundling Fix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#NaN Fix
from sklearn.impute import SimpleImputer

#Categorical/Object Data Fix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

#Plotting
import matplotlib.pyplot as plt

In [95]:
#Get and Display current working directory
cwd = os.getcwd()

#Change Directory to parent (one step up)
filepath_elements = cwd.split('/')
filepath_elements.pop()
filepath= '/'.join(filepath_elements)
filepath

#Save filenames and path as str in variables
datapath = '/30-days-of-ml/'
dataname_test = 'test.csv'
dataname_train = 'train.csv'
dataname_samplesubmit = 'sample_submission.csv'

#Read the csv files and initialize as Dataframes
data_test = pd.read_csv(filepath+datapath+dataname_test, index_col='id')
data_train = pd.read_csv(filepath+datapath+dataname_train, index_col='id')
data_samplesubmit = pd.read_csv(filepath+datapath+dataname_samplesubmit, index_col='id')

### Display Data for viewing


In [96]:
data_samplesubmit.head(10);

In [97]:
data_train.head(10);

In [98]:
data_test.head(10);

In [99]:
#Getting Columns
train_cols = list(data_train.columns)
print(train_cols);

['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'target']


In [100]:
train_cats = [col for col in train_cols if 'cat' in col]
train_conts = [col for col in train_cols if 'cont' in col]

In [101]:
test_cols = list(data_test.columns)
print(test_cols);

['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']


In [102]:
test_cats = [col for col in test_cols if 'cat' in col]
test_conts = [col for col in test_cols if 'cont' in col]

In [103]:
#Inspect for NaNs
print(data_train.isnull().sum().sum())
data_train.isnull().any();

0


In [104]:
print(data_test.isnull().sum().sum())
data_test.isnull().any();

0


In [105]:
#Inspect categorical data for cardinality
cardinals_train = list(map(lambda col: data_train[col].nunique(), train_cats))
save_cardinals_train = dict(zip(train_cats, cardinals_train))
sorted(save_cardinals_train.items(), key=lambda x: x[1]);

data_train.nunique();
data_train[train_cats].nunique();
list(map(lambda col: data_train[col].nunique(), train_cats));

In [106]:
cardinals_test = list(map(lambda col: data_test[col].nunique(), test_cats))
save_cardinals_test = dict(zip(train_cats, cardinals_test))
sorted(save_cardinals_test.items(), key=lambda x: x[1]);

In [107]:
#filter low cardinality columns
filter_limit = 10

low_cardinality_categorical_cols = [col for col in data_train.columns if data_train[col].nunique()<filter_limit and data_train[col].dtype == 'object']

#filter only numerical columns
numerical_cols = train_conts.copy()

#get columns to be used 
selected_cols = low_cardinality_categorical_cols + numerical_cols


#### "Cardinality" means the number of unique values in a column
#### Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

#### Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

#### Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [108]:
#Get full X and full y from data_train and thus X_test from data_test

y_full = data_train.target.copy()

X_full = data_train[selected_cols].copy()

X_test = data_test[selected_cols].copy()

X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_full, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [109]:
#Make your pipelines

#Preprocessing for numerical data; no NaNs in data means not used though
numerical_transformer = SimpleImputer(strategy='constant')

numerical_stategies = ['constant', 'mean', 'median', 'most_frequent']

#Preprocessing for categorical data

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frquent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_strategies = ['most_frequent']

#Bundle Preprocessing together => numerical and categorical 

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, low_cardinality_categorical_cols)

])

#Define Model(s)

#Models

#DecisionTreeRegressor
dtr1 = DecisionTreeRegressor(random_state=2)
dtr2 = DecisionTreeRegressor(max_leaf_nodes=20, random_state=2)
dtr3 = DecisionTreeRegressor(max_leaf_nodes=40, random_state=2)
dtr4 = DecisionTreeRegressor(max_leaf_nodes=60, random_state=2)
dtr5 = DecisionTreeRegressor(max_leaf_nodes=80, random_state=2)

#RandomForestRegressor
rfr1 = RandomForestRegressor(n_estimators=50, random_state=0)
rfr2 = RandomForestRegressor(n_estimators=100, random_state=0)
rfr3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
rfr4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
rfr5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

#xgboost 

xgb1 = XGBRegressor(random_state=0)

models = [xgb1, dtr1, dtr2, dtr3, dtr4 , dtr5 , rfr1, rfr2, rfr 3, rfr4, rfr5]

#Bundle Preprocessing to Model => Model and Preprocessing

pipieline1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dtr1)
])

pipieline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dtr2)
])

pipieline3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dtr3)
])

pipieline4 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dtr4)
])

pipieline5 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dtr5)
])






#Function for comparing different models
def score_model(my_pipeline, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    my_pipeline.fit(X_t, y_t)
    predictions = my_pipeline.predict(X_v)
    rms = mean_squared_error(y_v, predictions, squared= False)
    return rms

'''for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))'''







SyntaxError: invalid syntax (<ipython-input-109-4724b53932ea>, line 47)

In [110]:
#Select best approach

#Generate test predictions

#predictions_test = my_pipeline.predict(X_test)

# Approach 1a: Drop all categorical columns

#Code snippet
#dropna data=data.dropna(axis=0) which i think means rows

data_train_1a = data_train.copy()


#Gain y from train data
y_train_1a = data_train_1a.target

#features_nocat = [col for col in train_cols if 'cont' in col]
#print(features_nocat)

#Gain X from train data
X_train_1a = data_train_1a.drop(['target'], axis=1) #inlince keeps one copy and makes changes to it instead of making new copes

#alternatively
#train_cols.pop('target') => maybe this is a thing? Idk haven't checked
# X_train = data_train[train_cols]