In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

%matplotlib inline


In [2]:
powerlifting = pd.read_pickle('./powerlifting_clean.p')

In [5]:
mask = ((powerlifting['Squat1Kg'].isnull()==False) & (powerlifting['Squat2Kg'].isnull()==False) & \
        (powerlifting['Squat3Kg'].isnull()==False)) | ((powerlifting['Bench1Kg'].isnull()==False) & \
        (powerlifting['Bench2Kg'].isnull()==False) & (powerlifting['Bench3Kg'].isnull()==False)) | \
        ((powerlifting['Deadlift1Kg'].isnull()==False) & (powerlifting['Deadlift2Kg'].isnull()==False) & \
         (powerlifting['Deadlift3Kg'].isnull()==False))

In [6]:
powerlifting = powerlifting[mask]

In [11]:
def success_or_fail(val):
    if val > 0:
        return 1
    if val <= 0:
        return 0
    else:
        return np.nan

In [12]:
powerlifting['SuccessSquat1'] = powerlifting['Squat1Kg'].apply(success_or_fail)

In [14]:
lifts = ['Squat2Kg', 'Squat3Kg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg']

for col in lifts:
    title = 'Success{}'.format(col)
    powerlifting[title] = powerlifting[col].apply(success_or_fail)

In [20]:
lifts = ['Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg']

for col in lifts:
    title = 'Attempt{}'.format(col)
    powerlifting[title] = powerlifting[col].apply(abs)

In [22]:
powerlifting['EquipmentBinary'] = powerlifting.Equipment.apply(lambda x: 0 if x == 'Raw' else 1)
powerlifting['SexBinary'] = powerlifting.Sex.apply(lambda x: 1 if x == 'F' else 0)

In [49]:
experiment = powerlifting[(powerlifting['SuccessSquat1'].isnull()==False) & (powerlifting['AttemptSquat2Kg'].isnull()==False)]

In [50]:
experiment = experiment[powerlifting['Age'].isnull()==False]

  """Entry point for launching an IPython kernel.


In [119]:
X = experiment[['SuccessSquat1', 'AttemptSquat1Kg', 'AttemptSquat2Kg', 'BodyweightKg', 'Age', 'EquipmentBinary', 'SexBinary']]
y = experiment['SuccessSquat2Kg']

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
lg = LogisticRegression()
lg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [121]:
lg.score(X_test, y_test)

0.79080824088748014

In [122]:
from sklearn.metrics import classification_report, confusion_matrix

In [123]:
preds = lg.predict(X_test)
print (classification_report(y_test, preds))

             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00      1847
        1.0       0.79      1.00      0.88      6987

avg / total       0.63      0.79      0.70      8834



In [124]:
confusion_matrix(y_test, preds)

array([[   0, 1847],
       [   1, 6986]])

In [125]:
probas = lg.predict_proba(X_test)

In [153]:
print (probas[:, 0].max())
print (probas[: , 1].min())

0.507529993928
0.492470006072


In [127]:
rfc = RandomForestClassifier(n_estimators = 50)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [128]:
rfc.score(X_test, y_test)

0.78028073352954497

In [129]:
rfc_preds = rfc.predict(X_test)
print (classification_report(y_test, rfc_preds))

             precision    recall  f1-score   support

        0.0       0.43      0.16      0.23      1847
        1.0       0.81      0.94      0.87      6987

avg / total       0.73      0.78      0.74      8834



In [130]:
rfc_probas = rfc.predict_proba(X_test)

In [131]:
print (rfc_probas[:, 0].mean())
print (rfc_probas[: , 1].mean())

0.203741103097
0.796258896903


In [132]:
confusion_matrix(y_test, rfc_preds)

array([[ 292, 1555],
       [ 386, 6601]])

In [133]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [134]:
ada.score(X_train, y_train)

0.7999924525453791

In [135]:
ada_preds = ada.predict(X_test)
ada_probas = ada.predict_proba(X_test)

In [136]:
print (ada_probas[:, 0].mean())
print (ada_probas[: , 1].mean())

0.49258524851
0.50741475149


In [137]:
print (classification_report(y_test, ada_preds))

             precision    recall  f1-score   support

        0.0       0.30      0.00      0.01      1847
        1.0       0.79      1.00      0.88      6987

avg / total       0.69      0.79      0.70      8834



In [138]:
confusion_matrix(y_test, ada_preds)

array([[   6, 1841],
       [  14, 6973]])

In [139]:
experiment2 = powerlifting[(powerlifting['SuccessSquat2Kg'].isnull()==False) & \
                           (powerlifting['AttemptSquat3Kg'].isnull()==False)]
experiment2 = experiment2[powerlifting['Age'].isnull()==False]

  


In [140]:
X2 = experiment2[['SuccessSquat2Kg', 'AttemptSquat2Kg', 'AttemptSquat3Kg', 'BodyweightKg', 'Age', 'EquipmentBinary', 'SexBinary']]
y2 = experiment2['SuccessSquat3Kg']

In [141]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 42)

In [142]:
rfc2 = RandomForestClassifier()
rfc2.fit(X2_train, y2_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [143]:
rfc2.score(X2_test, y2_test)

0.60383203304268018

In [144]:
rfc2_preds = rfc2.predict(X2_test)
rfc2_probas = rfc2.predict_proba(X2_test)

In [145]:
print(classification_report(y2_test, rfc2_preds))

             precision    recall  f1-score   support

        0.0       0.49      0.48      0.48      3358
        1.0       0.68      0.68      0.68      5358

avg / total       0.60      0.60      0.60      8716



In [146]:
confusion_matrix(y2_test, rfc2_preds)

array([[1608, 1750],
       [1703, 3655]])

In [147]:
print (rfc2_probas[:, 0].mean())
print (rfc2_probas[: , 1].mean())

0.390636951201
0.609363048799


In [148]:
ada2 = AdaBoostClassifier()
ada2.fit(X2_train, y2_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [149]:
ada2.score(X2_test, y2_test)

0.6210417622762735

In [150]:
ada2_preds = ada2.predict(X2_test)
ada2_probas = ada2.predict_proba(X2_test)

In [151]:
print(classification_report(y2_test, ada2_preds))

             precision    recall  f1-score   support

        0.0       0.52      0.17      0.26      3358
        1.0       0.63      0.90      0.75      5358

avg / total       0.59      0.62      0.56      8716



In [155]:
confusion_matrix(y2_test, ada2_preds)

array([[ 579, 2779],
       [ 524, 4834]])