In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC as svm

## Data Preparation

### Load Datasets

In [2]:
current_directory = os.getcwd()
df_train = pd.read_csv(os.path.join(current_directory,"datasets/train.csv"))
df_test = pd.read_csv(os.path.join(current_directory,"datasets/test.csv"))

pixels = df_train.columns[1:]
X = df_train[pixels]

In [3]:
print(df_train.shape)
print(X.shape)
print(df_test.shape)

(42000, 785)
(42000, 784)
(28000, 784)


### Address NULL/Missing Values

In [3]:
X.isnull().any().describe()

count       784
unique        1
top       False
freq        784
dtype: object

In [7]:
df_test.isnull().any().describe()

count       784
unique        1
top       False
freq        784
dtype: object

### Normalization

In [None]:
X = X / 255.0
df_train = df_train / 255.0

### Reshape Data

In [None]:
# Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
#X = X.values.reshape(-1,28,28,1)
#df_test = df_test.values.reshape(-1,28,28,1)

In [8]:
# Set the random seed
random_seed = 4

In [9]:
y = df_train["label"]

# Split the train and the validation set for the fitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=random_seed)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(37800, 784)
(4200, 784)
(37800,)
(4200,)


In [10]:
# Run Data Through Standarizer
standardized = StandardScaler().fit(X_train)
X_train = standardized.transform(X_train)
X_test = standardized.transform(X_test)


  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [11]:
# Dimensionality Reduction
pca = PCA().fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(37800, 784)
(4200, 784)
(37800,)
(4200,)


## Model Analysis (Support Vector Machine)

In [13]:
svc_classifier = svm(gamma = 0.1, kernel = "poly", random_state = 42)

### Build & Train Model ()

In [14]:
svc_classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='poly',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

### Evaluate Model

### Prediction

In [15]:
predictions = svc_classifier.predict(df_test)

In [19]:
predictions = pd.DataFrame(predictions)
predictions.head()

Unnamed: 0,0
0,2
1,2
2,6
3,6
4,1


In [68]:
submission = df_test["label"]

submission = pd.DataFrame(submission)
#submission.head()
print(submission.shape)

KeyError: 'label'

In [67]:
test = pd.DataFrame(predictions)
#test["PredictedNumber"] = predictions
print(test.shape)
#submission["PredictedNumber"] = predictions
#submission.columns = ["ImageId","Label"]
#submission1.describe()

(28000, 1)


In [61]:
submission.Label.astype('int64')

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [45]:
submission.to_csv(os.path.join(current_directory,"results.csv"), index=False)
print(submission.dtypes)

ImageId     object
Label      float64
dtype: object


In [44]:
sample_submission = pd.read_csv(os.path.join(current_directory,"datasets/sample_submission.csv"))
print(sample_submission.dtypes)
sample_submission.head()

ImageId    int64
Label      int64
dtype: object


Unnamed: 0,ImageId,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [70]:
print(df_test.columns)


Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)
