In [38]:
import numpy as np
import pandas as pd
import competition_helpers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [39]:
# I/O configuration: input feature/label files and the submission output path.
X_train = competition_helpers.read_csv("train_features.csv")
y_train = competition_helpers.read_csv("train_label.csv", remove_header=True)
X_test = competition_helpers.read_csv("test_features.csv")

# The first column of the raw test file (read without a header row) becomes
# the "id" column of the submission written at the end of the notebook.
test_frame_raw = pd.read_csv("test_features.csv", header=None)
submission_col = np.array(test_frame_raw.iloc[:, 0]).ravel()
submission_file_name = "results/logisticregression_standardized_submission.csv"

# Sanity check: show the shapes of everything that was just loaded.
print(*(loaded.shape for loaded in (X_train, y_train, X_test)))

(418, 100) (418, 1) (378, 100)


In [25]:
# 5-fold stratified cross-validation splits, built twice from the same data:
# first with the helper's last flag set to False (no standardization), then
# with it set to True (with standardization).
train_test_split, standardized_train_test_split = (
    competition_helpers.kfold_stratified_split(X_train, y_train, 5, standardize)
    for standardize in (False, True)
)

In [26]:
# 5 fold train test split results
def cross_validate_logreg(splits):
    """Fit and score a logistic regression on every cross-validation fold.

    Parameters
    ----------
    splits : iterable of ((X_fit, y_fit), (X_val, y_val))
        One entry per fold; each fold's labels are flattened with ``.ravel()``
        before fitting and scoring.

    Returns
    -------
    numpy.ndarray
        ``[accuracy, precision, recall, f1]`` averaged over all folds.
    """
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    fold_scores = []
    for (X_fit, y_fit), (X_val, y_val) in splits:
        # A fresh model per fold so nothing learned on one fold leaks into the next.
        # 'multi_class' is left at its default: 'auto' is the default in current
        # scikit-learn and passing it explicitly is deprecated since 1.5.
        clf = LogisticRegression(random_state=0, solver='saga', max_iter=2000)
        clf.fit(X_fit, y_fit.ravel())

        y_true = y_val.ravel()
        y_pred = clf.predict(X_val).ravel()
        fold_scores.append([metric(y_true, y_pred) for metric in metrics])

    # Column-wise mean over folds -> one value per metric.
    return np.mean(np.array(fold_scores), axis=0)


# Without standardization
measures = cross_validate_logreg(train_test_split)
print(measures)

# With standardization
measures = cross_validate_logreg(standardized_train_test_split)
print(measures)



[0.66750565 0.84921412 0.67219662 0.75003226]
[0.83022443 0.87927443 0.89713262 0.88717683]


In [40]:
# Train the final model on the full training set and predict the test set.
# 'multi_class' is left at its default: 'auto' is the default in current
# scikit-learn and passing it explicitly is deprecated since 1.5.
clf = LogisticRegression(random_state=0, solver='saga', max_iter=2000)
X_train_standard, X_test_standard = competition_helpers.standardize_data(X_train, X_test)
# Fit on the *standardized* training features. The model predicts on
# X_test_standard below, so fitting on raw X_train would train and predict on
# differently scaled inputs and silently degrade the submission.
clf.fit(X_train_standard, y_train.ravel())
prediction = clf.predict(X_test_standard)



In [41]:
# Assemble the submission table (test ids + predicted labels) and write it as CSV.
submission = pd.DataFrame({"id": submission_col, "label": prediction})
submission.to_csv(submission_file_name, encoding='utf-8', index=False)