In [1]:
import numpy as np
import pandas as pd
import scipy.stats

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import normalize, scale, Normalizer, StandardScaler, OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingRegressor, StackingRegressor

from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import precision_score, f1_score, accuracy_score, roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

from sklearn.svm import SVR, SVC
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.metrics import roc_curve, auc, roc_auc_score

from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from statistics import mean



In [2]:
# load in my data:
data = pd.read_csv("2018-19 asco and weather data for modeling.csv", index_col = 0)

In [3]:
data.select_dtypes('object')

Unnamed: 0,SamplerNo,Location.x,SamplerType,qPCR_Plate,Date,SampleID,Censored,Water.,Location.y
1,01,1917,Burkard,Plate 09,2019-07-08,T1-189,False,No,
2,01,1917,Burkard,Plate 09,2019-07-09,T1-190,False,No,1917
3,01,1917,Burkard,Plate 09,2019-07-10,T1-191,True,Yes,1917
4,01,1917,Burkard,Plate 09,2019-07-11,T1-192,False,No,1917
5,01,1917,Burkard,Plate 01,2019-07-12,T1-193,False,No,1917
...,...,...,...,...,...,...,...,...,...
564,Taber,Taber,Burkard,,2018-08-24,,,,Taber
565,Taber,Taber,Burkard,,2018-08-25,,,,Taber
566,Taber,Taber,Burkard,,2018-08-26,,,,Taber
567,Taber,Taber,Burkard,,2018-08-27,,,,Taber


In [7]:
data.shape

(568, 98)

In [8]:
data.head()

Unnamed: 0,SamplerNo,Location.x,SamplerType,ExtractionGroup,qPCR_Plate,JDay,Date,SampleID,SsCtMean,SsCtSD,...,MinDP_1d,MeanSoilTemp,DiffMeanSoilT,MeanSoilT_1d,MaxSoilTemp,DiffMaxSoilT,MaxSoilT_1d,MinSoilTemp,DiffMinSoilT,MinSoilT_1d
1,1,1917,Burkard,,Plate 09,189,2019-07-08,T1-189,160.0,0.14,...,,,,,,,,,,
2,1,1917,Burkard,,Plate 09,190,2019-07-09,T1-190,327.0,0.43,...,,20.62917,,,27.1,,,13.7,,
3,1,1917,Burkard,,Plate 09,191,2019-07-10,T1-191,380.0,0.17,...,6.9,21.70833,1.07917,20.62917,28.2,1.1,27.1,15.4,1.7,13.7
4,1,1917,Burkard,,Plate 09,192,2019-07-11,T1-192,209.0,0.36,...,7.6,19.81667,-1.89167,21.70833,24.3,-3.9,28.2,15.3,-0.1,15.4
5,1,1917,Burkard,,Plate 01,193,2019-07-12,T1-193,152.0,0.06,...,8.5,20.40833,0.59167,19.81667,25.7,1.4,24.3,14.5,-0.8,15.3


In [170]:
data['qPCR_Plate'].unique()

array(['Plate 09', 'Plate 01', 'Plate 10', 'Plate 16', nan, 'Plate 21',
       'Plate 03', 'Plate 04', 'Plate 18 redone', 'Plate 13', 'Plate 17',
       'Plate 22', 'Plate 23', 'Plate 15', 'Plate 07', 'Plate 20'],
      dtype=object)

In [169]:
data['SsMean']

1        317.430
2         67.090
3         35.450
4        210.560
5        313.940
         ...    
564     3306.850
565    16854.055
566     1646.950
567     2692.505
568    13539.785
Name: SsMean, Length: 545, dtype: float64

In [10]:
data.columns

Index(['SamplerNo', 'Location.x', 'SamplerType', 'ExtractionGroup',
       'qPCR_Plate', 'JDay', 'Date', 'SampleID', 'SsCtMean', 'SsCtSD',
       'SsMean', 'logSsMean', 'Censored', 'SsSD', 'SsScaled', 'SsClean',
       'logSsClean', 'VolumeSampled', 'sporesPCM', 'logSPCM', 'TtCt', 'TtSD',
       'Water.', 'logSsMean_t1', 'Location.y', 'Year', 'MeanWetness',
       'DiffMeanWet', 'MeanWet_1d', 'MaxWetness', 'DiffMaxWet', 'MaxWet_1d',
       'MinWetness', 'DiffMinWet', 'MinWet_1d', 'MeanTemp', 'DiffMeanT',
       'MeanT_1d', 'MaxTemp', 'DiffMaxT', 'MaxT_1d', 'MinTemp', 'DiffMinT',
       'MinT_1d', 'MeanRH', 'DiffMeanRH', 'MeanRH_1d', 'MaxRH', 'DiffMaxRH',
       'MaxRH_1d', 'MinRH', 'DiffMinRH', 'MinRH_1d', 'DiffRH_0d', 'DiffRH_1d',
       'DiffRH_2d', 'MaxDiffRH_2h', 'MaxDiffRH_3h', 'MeanVPD', 'DiffMeanVPD',
       'MeanVPD_1d', 'MaxVPD', 'DiffMaxVPD', 'MaxVPD_1d', 'MinVPD',
       'DiffMinVPD', 'TotalPrecip', 'Precip_1d', 'RainYN', 'MaxRain',
       'MinRain', 'MeanWC', 'DiffMeanWC', 

In [11]:
data[['SsMean', 'sporesPCM']]

Unnamed: 0,SsMean,sporesPCM
1,317.430,13.23
2,67.090,2.80
3,35.450,0.00
4,210.560,8.77
5,313.940,13.08
...,...,...
564,3306.850,
565,16854.055,
566,1646.950,
567,2692.505,


In [89]:
cont = data.select_dtypes(['int','float'])

In [90]:
percent_missing = cont.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({'column_name': cont.columns,
                                 'percent_missing': percent_missing})

In [65]:
missing_value_df.sort_values('percent_missing', ascending=False).head(30)

Unnamed: 0,column_name,percent_missing
ExtractionGroup,ExtractionGroup,100.0
SsScaled,SsScaled,100.0
DiffMinSoilT,DiffMinSoilT,58.626761
DiffMaxSoilT,DiffMaxSoilT,58.626761
DiffMeanSoilT,DiffMeanSoilT,58.626761
MaxSoilT_1d,MaxSoilT_1d,58.274648
MeanSoilT_1d,MeanSoilT_1d,58.274648
MinSoilT_1d,MinSoilT_1d,58.274648
MinSoilTemp,MinSoilTemp,57.570423
MeanSoilTemp,MeanSoilTemp,57.570423


In [91]:
# cols with greater than 50% missing values
miss50 = missing_value_df[missing_value_df['percent_missing'] > 50]

In [67]:
miss50.index

Index(['ExtractionGroup', 'SsCtMean', 'SsCtSD', 'SsSD', 'SsScaled', 'SsClean',
       'logSsClean', 'TtCt', 'TtSD', 'MeanSoilTemp', 'DiffMeanSoilT',
       'MeanSoilT_1d', 'MaxSoilTemp', 'DiffMaxSoilT', 'MaxSoilT_1d',
       'MinSoilTemp', 'DiffMinSoilT', 'MinSoilT_1d'],
      dtype='object')

In [93]:
cont.dropna(axis=0, subset=['SsMean'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cont.dropna(axis=0, subset=['SsMean'],inplace=True)


In [128]:
def create_cat(data):
    if data < 500:
        return 0
    else:
        return 1

In [129]:
cont['label'] = cont['SsMean'].apply(create_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cont['label'] = cont['SsMean'].apply(create_cat)


In [144]:
# try balanced dataset
class_0 = cont.loc[cont['label']==0]
class_1 = cont.loc[cont['label']==1]

In [145]:
class_0.shape

(377, 90)

In [154]:
class_1_sample = class_1.sample(n=377, replace=True)

In [155]:
class_1_sample.shape

(377, 90)

In [156]:
from sklearn.utils import shuffle
balanced_df = pd.concat([class_0, class_1_sample], axis=0)
balanced_df = shuffle(balanced_df)

In [157]:
balanced_df.shape

(754, 90)

In [158]:
X = balanced_df.drop(labels=['SsMean','label'], axis=1)
y = balanced_df['label']

In [159]:
X.shape

(754, 88)

In [160]:
X.drop(labels=miss50.index, axis=1, inplace=True)

In [161]:
X.shape

(754, 70)

In [162]:
y.shape

(754,)

In [88]:
y.isnull().sum()

568

In [163]:
# fill with mean....just for experiementation purposes
for col in X.columns:
    X[col] = X.fillna(value=X[col].mean())

In [164]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [165]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=233)

In [166]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
model = lr.fit(X_train,y_train)

In [167]:
y_pred = lr.predict(X_test)
lr.score(X_test, y_test)

0.5947136563876652

In [172]:
from sklearn.metrics import roc_auc_score

probs = lr.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test,probs)
auc

0.6402192505053647

In [168]:
from sklearn.metrics import classification_report

cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.61      0.59      0.60       118
           1       0.58      0.60      0.59       109

    accuracy                           0.59       227
   macro avg       0.59      0.59      0.59       227
weighted avg       0.60      0.59      0.59       227



In [117]:
X.shape

(545, 70)

In [121]:
kbest = SelectKBest(k=20)
kbest = kbest.fit(X_train,y_train)
X2_train = kbest.transform(X_train)
X2_test = kbest.transform(X_test)

In [122]:
X2_train.shape

(381, 20)

In [126]:
X2_test.shape

(164, 20)

In [110]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

scaler = StandardScaler()

# create pca and kbest and merge with featureunion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

pca = PCA(n_components=3)
kbest = SelectKBest(k=3)

feature_union = FeatureUnion([
    ('pca', pca),
    ('select_best', kbest)
])

from sklearn.svm import SVR

model = SVR()

pipeline = Pipeline(
    steps=[
    ('scaler', scaler),
    ('features', feature_union),
    ('classifier', model)
])

In [111]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('features',
                 FeatureUnion(transformer_list=[('pca', PCA(n_components=3)),
                                                ('select_best',
                                                 SelectKBest(k=3))])),
                ('classifier', SVR())])

In [112]:
y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)

-0.07526561955881239

In [10]:
y.loc[y.values > 1000].count()

98

In [11]:
cont = X.select_dtypes(['float','int'])

In [12]:
cont.shape

(568, 88)

In [14]:
cont.isnull().sum().sort_values(ascending=False).head(20)

ExtractionGroup    568
SsScaled           568
DiffMinSoilT       333
DiffMaxSoilT       333
DiffMeanSoilT      333
MeanSoilT_1d       331
MinSoilT_1d        331
MaxSoilT_1d        331
MaxSoilTemp        327
MinSoilTemp        327
MeanSoilTemp       327
SsCtMean           297
TtCt               297
SsCtSD             297
SsSD               297
SsClean            297
TtSD               297
logSsClean         297
logSPCM            247
sporesPCM          247
dtype: int64