In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

  from numpy.core.umath_tests import inner1d


In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
training = pd.read_csv("train.csv")
testing = pd.read_csv("test.csv")

# Names of the 784 pixel columns: "pixel0", "pixel1", ..., "pixel783".
features = ["pixel" + str(idx) for idx in range(784)]

# Feature matrix / target for fitting, and the feature matrix to predict on.
X_train = training[features]
y_train = training["label"]
X_test = testing[features]

# Last expression of the cell: preview the first rows of the training frame.
training.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Carve a held-out validation part out of the labelled data.
# random_state=0 pins the shuffle so every run produces the same split;
# the split proportion is left at train_test_split's default.
(X_training, X_testing,
 y_training, y_testing) = train_test_split(X_train, y_train, random_state=0)

### Logistic Regression

In [17]:
# Validation run: fit on the training part of the split, score on the held-out part.
# The "sag" solver shuffles the samples, so random_state is pinned (same value as the
# train/validation split) to make the printed accuracy reproducible.
# NOTE(review): max_iter=25 caps the number of passes, and the import cell silences all
# warnings, so a non-converged fit would go unnoticed here.
logreg = LogisticRegression(solver="sag", max_iter=25, random_state=0)
logreg.fit(X_training, y_training)
y_pred = logreg.predict(X_testing)
acc_logreg = accuracy_score(y_testing, y_pred)

print(acc_logreg)

0.9112380952380953


In [None]:
# Submission run: refit on every labelled row and predict the unlabelled test set.
# max_iter is 25 to match the configuration whose validation accuracy is reported in
# the validation cell (this cell previously used 20, i.e. a different model from the
# one that was scored). random_state pins the "sag" shuffling so the submission file
# is reproducible.
logreg = LogisticRegression(solver="sag", max_iter=25, random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Single "Label" column; shift the default 0-based index so it starts at 1 and is
# written out under the "ImageId" header.
submission = pd.DataFrame({
    "Label": y_pred
})
submission.index += 1

submission.to_csv("submission_logreg.csv", index_label="ImageId")
print(submission.shape)
print(submission.head())

### Stochastic Gradient Descent

In [15]:
# Validation run for a linear classifier trained by SGD with the perceptron loss.
# shuffle=True reorders the samples between epochs; random_state pins that shuffling
# (same value as the train/validation split) so the printed accuracy is reproducible.
sgd = SGDClassifier(loss='perceptron', n_jobs=3, shuffle=True, random_state=0)
sgd.fit(X_training, y_training)
y_pred = sgd.predict(X_testing)
acc_sgd = accuracy_score(y_testing, y_pred)

print(acc_sgd)

0.8864761904761905


In [16]:
# Submission run: refit the perceptron-loss SGD classifier on every labelled row and
# predict the unlabelled test set. random_state pins the sample shuffling so the
# submission file is identical from run to run.
sgd = SGDClassifier(loss='perceptron', n_jobs=3, shuffle=True, random_state=0)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Single "Label" column; shift the default 0-based index so it starts at 1 and is
# written out under the "ImageId" header.
submission = pd.DataFrame({
    "Label": y_pred
})
submission.index += 1

submission.to_csv("submission_sgd.csv", index_label="ImageId")
print(submission.shape)
print(submission.head())

(28000, 1)
   Label
1      2
2      0
3      9
4      9
5      3


### BernoulliNB Model

In [4]:
# Validation run for Bernoulli naive Bayes with default settings:
# fit on the training part of the split, then score on the held-out part.
bnb = BernoulliNB().fit(X_training, y_training)
y_pred = bnb.predict(X_testing)
acc_bnb = accuracy_score(y_true=y_testing, y_pred=y_pred)

print(acc_bnb)

0.8382857142857143


In [12]:
# Submission run: refit Bernoulli naive Bayes on every labelled row and predict
# the unlabelled test set.
bnb = BernoulliNB().fit(X_train, y_train)
y_pred = bnb.predict(X_test)

# One "Label" column; move the default 0-based index up by one so ids start at 1.
submission = pd.DataFrame({"Label": y_pred})
submission.index = submission.index + 1

# The index is written under the "ImageId" header.
submission.to_csv("submission_bnb.csv", index_label="ImageId")
print(submission.shape)
print(submission.head())

(28000, 1)
   Label
1      2
2      0
3      9
4      9
5      2


### Neural Network

In [7]:
# Validation run for a multi-layer perceptron with two hidden layers of 800 units,
# trained with plain SGD. random_state pins the weight initialisation and the
# mini-batch shuffling (same value as the train/validation split) so the printed
# accuracy is reproducible.
# NOTE(review): max_iter=20 caps the number of epochs, and the import cell silences all
# warnings, so a non-converged fit would go unnoticed here.
nnc = MLPClassifier(hidden_layer_sizes=[800, 800], solver='sgd', shuffle=True,
                    max_iter=20, random_state=0)
nnc.fit(X_training, y_training)
y_pred = nnc.predict(X_testing)
acc_nnc = accuracy_score(y_testing, y_pred)

print(acc_nnc)

0.9578095238095238


In [8]:
# Submission run: refit the 800-800 multi-layer perceptron on every labelled row and
# predict the unlabelled test set. random_state pins the weight initialisation and
# the mini-batch shuffling so the submission file is identical from run to run.
nnc = MLPClassifier(hidden_layer_sizes=[800, 800], solver='sgd', shuffle=True,
                    max_iter=20, random_state=0)
nnc.fit(X_train, y_train)
y_pred = nnc.predict(X_test)

# Single "Label" column; shift the default 0-based index so it starts at 1 and is
# written out under the "ImageId" header.
submission = pd.DataFrame({
    "Label": y_pred
})
submission.index += 1

submission.to_csv("submission_nnc.csv", index_label="ImageId")
print(submission.shape)
print(submission.head())

(28000, 1)
   Label
1      2
2      0
3      9
4      9
5      3
