In [79]:
import pandas as pd
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Import DataSet

In [3]:
order_products_prior    = pd.read_csv('../datas/order_products__prior.csv')
aisles_df               = pd.read_csv('../datas/aisles.csv')
departments_df          = pd.read_csv('../datas/departments.csv')
orders_df               = pd.read_csv('../datas/orders.csv')
products_df             = pd.read_csv('../datas/products.csv')
order_products_train    = pd.read_csv('../datas/order_products__train_cap.csv')

In [92]:
user_feautres           = pd.read_csv('../datas/user_features.csv')

This will be my training data. I will merge the product_df and departments_df and aisles_df's Ids into my final_train. Then I will perform a Train_Test_split 

In [93]:
final_train = order_products_train.merge(products_df, how='left', on='product_id')
final_train = final_train.merge(departments_df, how='left', on='department_id')
final_train = final_train.merge(aisles_df, how='left', on='aisle_id')
final_train = final_train.merge(orders_df, how='left', on='order_id')
final_train = final_train.merge(user_feautres, how='left', on='user_id')
final_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,total_order,total_product,days_since_last_order,dow_average
0,36,39612,1,0,Grated Pecorino Romano Cheese,2,16,dairy eggs,specialty cheeses,79431,train,23,6,18,30.0,22,187,15.0,3.181818
1,36,19660,2,1,Spring Water,115,7,beverages,water seltzer sparkling water,79431,train,23,6,18,30.0,22,187,15.0,3.181818
2,36,49235,3,0,Organic Half & Half,53,16,dairy eggs,cream,79431,train,23,6,18,30.0,22,187,15.0,3.181818
3,36,43086,4,1,Super Greens Salad,123,4,produce,packaged vegetables fruits,79431,train,23,6,18,30.0,22,187,15.0,3.181818
4,36,46620,5,1,Cage Free Extra Large Grade AA Eggs,86,16,dairy eggs,eggs,79431,train,23,6,18,30.0,22,187,15.0,3.181818


Asigning my X and y variable

In [106]:
X = final_train.drop(columns=['reordered'
                              ,'product_name'
                              ,'department'
                              ,'aisle'
                              ,'eval_set'])
y = final_train['reordered']

Before performing train_test_split, it is crucial to check the rows are matched

In [107]:
X.shape

(1038515, 14)

In [108]:
final_train.shape

(1038515, 19)

## Baseline Model
**What is this?**
- This is our baseline model. The 1's represent there are 59.8861% of which means reordered (positive class) and 40.1139% of didn't reordered (negative class) in our dataset. This is good, because this tells us that our model is pretty evenly distributed.  Our base goal is to fit our data into several models and get the accuracy score that is higher than **59.8861%**. In that case, we will know we are doing better than the baseline.

In [109]:
y.value_counts(normalize=True)

1    0.598861
0    0.401139
Name: reordered, dtype: float64

## Train_test_split

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Machine Learning Model: Logistic Regression
---


**Why/why not this model?**
- I've used Grid Search to perform different parameter of C, and in result 1.0 gave the best result. 
- The parameter C is the the inverse of regularization strength in Logistic Regression. 
- The reason why I choose this model as my final, our expectation for our model is high accuracy. In this case the Logistic Regression's result gave us an accuracy of **64.189%**, which is higher than Random Forest Classifier's accuracy. 
- Not only the accuracy is higher than Random Forest Classifier, but also performed better than our Baseline model.


In [111]:
params = {'C': [1.0, 2.0, 3.0, 4.0]}

In [112]:
start_time = time.time()
gs_lr = GridSearchCV(LogisticRegression(random_state=42, solver='lbfgs'), 
                  param_grid=params, 
                  return_train_score=False,
                  cv=5)
gs_lr.fit(X_train, y_train.values.ravel())
print(f'Time Spent: {round((time.time()-start_time))} Seconds')

Time Spent: 49 Seconds


In [113]:
gs_lr.best_params_

{'C': 1.0}

In [114]:
gs_lr.score(X_train, y_train)

0.6418988144606528

In [115]:
gs_lr.score(X_test, y_test)

0.6425245253804467

In [116]:
predictions_lr = gs_lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions_lr).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 42939
False Positives: 60947
False Negatives: 31864
True Positives: 123879


In [130]:
predictions = gs_lr.predict(X)

In [131]:
f1_score(y, predictions)

0.7268670062726444

## Machine Learning Model: Random Forest Classifier
---

**Why/why not this model?**
- I've used Grid Search to perform different parameter of Min_sample_split, and max_depth in result it gave the best result. of max_depth of None and min_samples_split of 1.0, Due to the compute time, I can only minimize to 2 parameters. With more time and compute power, I am sure I can find a higher accuracy.
- The reason why I did not choose this model as my final, because to be able to interpret if a model is good the accuracy if a key, our expectation for our model is high accuracy. In this case the Random Forest Classifier's result gave us an accuracy of **59.825%**, which is lower than Logistic Regression's accuracy.


In [124]:
params = {
    'min_samples_split': [1.0,0.5],
    'max_depth': [None,1]
}

In [125]:
start_time = time.time()
gs = GridSearchCV(RandomForestClassifier(n_estimators=100), 
                  param_grid=params, 
                  return_train_score=False,
                  cv=3)
gs.fit(X_train, y_train.values.ravel())
print(f'Time Spent: {round((time.time()-start_time))} Seconds')

Time Spent: 137 Seconds


In [126]:
gs.score(X_train, y_train.values.ravel())

0.598525329765845

In [127]:
gs.score(X_test, y_test)

0.599867503245015

In [128]:
gs.best_params_

{'max_depth': None, 'min_samples_split': 1.0}