In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC as svm

## Data Preparation

### Load Datasets

In [2]:
current_directory = os.getcwd()
df_train = pd.read_csv(os.path.join(current_directory,"datasets/train.csv"))
df_test = pd.read_csv(os.path.join(current_directory,"datasets/test.csv"))

pixels = df_train.columns[1:]
X = df_train[pixels]

In [3]:
print(df_train.shape)
print(X.shape)
print(df_test.shape)

(42000, 785)
(42000, 784)
(28000, 784)


In [4]:
df_train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
sample_submission = pd.read_csv(os.path.join(current_directory,"datasets/sample_submission.csv"))
print(sample_submission.dtypes)
sample_submission.head()

ImageId    int64
Label      int64
dtype: object


Unnamed: 0,ImageId,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


### Address NULL/Missing Values

In [8]:
X.isnull().any().describe()

count       784
unique        1
top       False
freq        784
dtype: object

In [9]:
df_test.isnull().any().describe()

count       784
unique        1
top       False
freq        784
dtype: object

### Normalization

In [10]:
# Set the random seed
random_seed = 4

In [11]:
y = df_train["label"]

# Split the train and the validation set for the fitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=random_seed)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(37800, 784)
(4200, 784)
(37800,)
(4200,)


In [12]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(37800, 784)
(4200, 784)
(37800,)
(4200,)


### Reshape Data

In [13]:
# Run Data Through Standarizer
standardized = StandardScaler().fit(X_train)
X_train = standardized.transform(X_train)
X_test = standardized.transform(X_test)


  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [14]:
# Dimensionality Reduction
pca = PCA().fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [15]:
print(X.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(42000, 784)
(4200, 784)
(37800,)
(4200,)


## Model Analysis (Support Vector Machine)

In [16]:
svc_classifier = svm(gamma = 0.1, kernel = "poly", random_state = 42)

In [17]:
svc_classifier2 = svm(gamma = 0.1, kernel = "poly", random_state = 42)

### Build & Train Model ()

In [18]:
svc_classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='poly',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
svc_classifier2.fit(X_train2, y_train2)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='poly',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

### Evaluate Model

### Prediction

In [20]:
predictions = svc_classifier.predict(df_test)
predictions = svc_classifier2.predict(df_test)

In [21]:
predictions1 = pd.DataFrame(predictions)
predictions.head()

Unnamed: 0,0
0,2
1,0
2,9
3,4
4,3


In [22]:
predictions2 = pd.DataFrame(predictions)
predictions2.head()

NameError: name 'predictions2' is not defined

In [23]:
submission = predictions.copy()
print(submission.shape)
submission["ImageId"] = predictions.index+1
submission.rename(columns={0:"Label"}, inplace=True)

#submission = pd.DataFrame(submission)
submission.head()

(28000, 1)


Unnamed: 0,Label,ImageId
0,2,1
1,0,2
2,9,3
3,4,4
4,3,5


In [24]:
submission.to_csv(os.path.join(current_directory,"results.csv"), index=False)

In [None]:
print(df_test.shape)


In [None]:
print(sample_submission.columns)
print(sample_submission.shape)

In [None]:
sample_submission.head(50)

In [None]:
submission["ImageId"] = df_test.index+1
submission
submission.head()