In [1]:
# Step 1, import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Import dataset
model_df = pd.read_pickle("processed_symptoms.pkl")
print("The shape of the dataset is: ", model_df.shape)
print("The coluumns of this model are: ", model_df.columns)

The shape of the dataset is:  (16917, 20)
The coluumns of this model are:  Index(['user_id', 'acne', 'backache', 'bloating', 'cramp', 'diarrhea', 'dizzy',
       'headache', 'mood', 'nausea', 'sore', 'date', 'cycle_length_initial',
       'period_length_initial', 'period_start_cycle_start', 'period_end',
       'prev_cycle_end_date', 'cycle_end_date', 'prev_cycle_start_date',
       'cycle_percentage'],
      dtype='object')


In [3]:
# Step 2, create predictor and target variables

X_cramp = model_df[['acne','backache', 'bloating', 'diarrhea', 'dizzy', 'headache', 'mood', 'nausea', 'sore', 'cycle_percentage']]
y_cramp = model_df.cramp.values

X_bloating = model_df[['acne','backache', 'cramp', 'diarrhea', 'dizzy', 'headache', 'mood', 'nausea', 'sore', 'cycle_percentage']]
y_bloating = model_df.bloating.values

X_mood = model_df[['acne','backache', 'bloating', 'diarrhea', 'dizzy', 'headache', 'cramp', 'nausea', 'sore', 'cycle_percentage']]
y_mood = model_df.mood.values


In [4]:
# Step 3, cross validate a linear regresion for comparison to the regression tree)

# cross val Linear Reg with 4 folds
reg_score1 = cross_val_score(LinearRegression(), X_cramp, y_cramp, cv=3)
reg_score2 = cross_val_score(LinearRegression(), X_bloating, y_bloating, cv=3)
reg_score3 = cross_val_score(LinearRegression(), X_mood, y_mood, cv=3)

#get scores
print("CV and mean scores for cramp linear regression: ", reg_score1, np.mean(reg_score1))
print("CV and mean scores for bloating linear regression: ", reg_score2, np.mean(reg_score2))
print("CV and mean scores for mood linear regression: ", reg_score3, np.mean(reg_score3))

CV and mean scores for cramp linear regression:  [0.26628763 0.25879024 0.25829416] 0.2611240102022768
CV and mean scores for bloating linear regression:  [0.26948945 0.25093623 0.23305846] 0.251161378647387
CV and mean scores for mood linear regression:  [0.18452939 0.21651262 0.17791852] 0.19298684060364749


In [5]:
# Step 4a: Perform Grid-Search on a Random Forest Regressor for cramps
gsc = GridSearchCV(
estimator=RandomForestRegressor(),
param_grid={
            'max_depth': (5,10, 15, 20),
            'n_estimators': (10, 50, 100, 200, 50),
        }, 
cv=3, scoring='r2', verbose=1, n_jobs=-1)
    
grid_result = gsc.fit(X_cramp, y_cramp)
best_params = grid_result.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.2min finished


In [6]:
# Step 4b: Perform Grid-Search on a Random Forest Regressor for cramps
gsc = GridSearchCV(
estimator=RandomForestRegressor(),
param_grid={
            'max_depth': (5,10, 15, 20),
            'n_estimators': (10, 50, 100, 200, 50),
        }, 
cv=3, scoring='r2', verbose=1, n_jobs=-1)
    
grid_result = gsc.fit(X_bloating, y_bloating)
best_params_bloating = grid_result.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   53.1s finished


In [7]:
# Step 4c: Perform Grid-Search on a Random Forest Regressor for cramps
gsc = GridSearchCV(
estimator=RandomForestRegressor(),
param_grid={
            'max_depth': (5,10, 15, 20),
            'n_estimators': (10, 50, 100, 200, 50),
        }, 
cv=3, scoring='r2', verbose=1, n_jobs=-1)
    
grid_result = gsc.fit(X_mood, y_mood)
best_params_mood = grid_result.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   57.0s finished


In [8]:
best_params_cramps = best_params
print(best_params_cramps)
print(best_params_bloating)
print(best_params_mood)

{'max_depth': 20, 'n_estimators': 100}
{'max_depth': 20, 'n_estimators': 200}
{'max_depth': 20, 'n_estimators': 200}


In [9]:
# Step 5a, create a random forest regressor with the best parameters
rfr = RandomForestRegressor(max_depth=best_params["max_depth"], 
                            n_estimators=best_params["n_estimators"],                               
                            random_state=42, 
                            verbose=True)

In [10]:
# Step 5b-1, perform K-Fold CV for cramps
scores_cramp = cross_val_score(rfr, X_cramp, y_cramp, cv=3, scoring='r2', n_jobs=-1)

In [11]:
# Step 5b-2, perform K-Fold CV for bloating
scores_bloating = cross_val_score(rfr, X_bloating, y_bloating, cv=3, scoring='r2', n_jobs=-1)

In [12]:
# Step 5b-3, perform K-Fold CV for bloating
scores_mood = cross_val_score(rfr, X_mood, y_mood, cv=3, scoring='r2', n_jobs=-1)

In [13]:
print("Scores for cramps: ", scores_cramp, np.mean(scores_cramp))
print("Scores for bloating: ", scores_bloating, np.mean(scores_bloating))
print("Scores for mood: ", scores_mood, np.mean(scores_mood))

Scores for cramps:  [0.36807113 0.38658725 0.42711262] 0.3939236663585486
Scores for bloating:  [0.44496839 0.42781231 0.53267052] 0.46848374109808044
Scores for mood:  [0.36147523 0.39104552 0.47365559] 0.4087254435566627


In [14]:
# Step 6, create predictions using model

# Create splits for cramps
X_train_cramp, X_test_cramp, y_train_cramp, y_test_cramp = train_test_split(X_cramp, y_cramp, test_size=0.2)

# Create splits for bloating
X_train_bloating, X_test_bloating, y_train_bloating, y_test_bloating = train_test_split(X_bloating, y_bloating, test_size=0.2)

# Create splits for mood
X_train_mood, X_test_mood, y_train_mood, y_test_mood = train_test_split(X_mood, y_mood, test_size=0.2)


In [15]:
# Fit and Predict cramps
rfr.fit(X_train_cramp, y_train_cramp)
y_pred_cramp = rfr.predict(X_test_cramp)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [16]:
# Fit and Predict bloating
rfr.fit(X_train_bloating, y_train_bloating)
y_pred_bloating = rfr.predict(X_test_bloating)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [17]:
# Fit and Predict mood
rfr.fit(X_train_mood, y_train_mood)
y_pred_mood = rfr.predict(X_test_mood)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [18]:
print("R-squared score for cramp prediction: ", r2_score(y_test_cramp, y_pred_cramp))
print("R-squared score for bloating prediction: ", r2_score(y_test_bloating, y_pred_bloating))
print("R-squared score for mood prediction: ", r2_score(y_test_mood, y_pred_mood))

R-squared score for cramp prediction:  0.37950536752347397
R-squared score for bloating prediction:  0.42888813422319194
R-squared score for mood prediction:  0.3979736649767188
