# Linear SVC Assignment

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt


### Import the admissions data set (admissions.csv).

In [2]:
# Read the admissions CSV from its hosted location and preview the first rows.
ADMISSIONS_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv'
data = pd.read_csv(ADMISSIONS_URL)
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


In [3]:
# Feature groups used when predicting 'Admitted'.
# Numeric (score / rating) columns:
num_cols = [
    'GRE',
    'TOEFL',
    'SchoolRank',
    'SOP',
    'LOR',
    'GPA',
]
# 0/1 indicator columns:
bin_cols = [
    'Research',
]

### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [4]:
# Build the feature matrix from the explicitly listed feature columns only.
# Dropping just 'Admitted' would keep the CSV's 'Unnamed: 0' column (the saved
# row index) as a predictor, which carries no real information about a student.
X = data[num_cols + bin_cols]
# Target: whether the student was admitted.
y = data['Admitted']

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test set never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [7]:
# Linear-kernel SVC with regularization parameter C=10, fitted on the scaled training split.
model = SVC(kernel='linear', C=10).fit(X_train, y_train)
model

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Score of the current model on the training split.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)


0.8785714285714286


In [9]:
# Score of the current model on the held-out test split.
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)


0.8666666666666667


### Choose some other values for C and show the difference between the scores for the train and test sets.

In [10]:
# Same linear SVC, refitted with a smaller regularization parameter (C=0.1).
model = SVC(kernel='linear', C=0.1).fit(X_train, y_train)
model

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
# Score of the current model on the training split.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)


0.8821428571428571


In [12]:
# Score of the current model on the held-out test split.
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)


0.8416666666666667


In [13]:
# Same linear SVC, refitted with C=1.
model = SVC(kernel='linear', C=1).fit(X_train, y_train)
model

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
# Score of the current model on the training split.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)


0.8821428571428571


In [15]:
# Score of the current model on the held-out test split.
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)


0.8833333333333333


### What if we switched up the target variable? Let's assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was. 

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [22]:
# Feature groups for predicting 'SchoolRank': it leaves the numeric list
# (it is now the target) and 'Admitted' joins the 0/1 indicator columns.
num_cols = [
    'GRE',
    'TOEFL',
    'SOP',
    'LOR',
    'GPA',
]
bin_cols = [
    'Research',
    'Admitted',
]

In [23]:
# Select only the declared feature columns. Dropping just 'SchoolRank' would
# keep the CSV's 'Unnamed: 0' row-index column, which would then reach the SVM
# unscaled through the ColumnTransformer's passthrough remainder.
X = data[num_cols + bin_cols]
# Target: the rank of the student's school.
y = data['SchoolRank']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
# Standardize the numeric columns and forward the 0/1 indicator columns
# unchanged. Every other column is dropped explicitly: with
# remainder='passthrough', any stray column in X (such as the CSV's
# 'Unnamed: 0' row index) would be fed to the SVM unscaled.
preprocessing = ColumnTransformer(
    [
        ('scale', StandardScaler(), num_cols),
        ('binary', 'passthrough', bin_cols),
    ],
    remainder='drop',
)

In [25]:
# Chain preprocessing and the linear SVC so the scaler is refit inside each
# cross-validation fold rather than on the full training set.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('svm', SVC(kernel='linear')),
    ]
)

In [26]:
# Candidate values for the SVC's C; the 'svm__' prefix routes the parameter
# to the pipeline step named 'svm'.
grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# Cross-validated search with GridSearchCV's default settings; show the winning C.
pipeline_cv = GridSearchCV(estimator=pipeline, param_grid=grid).fit(X_train, y_train)
pipeline_cv.best_params_

{'svm__C': 0.1}

In [27]:
# Score of the fitted grid search on the training split.
train_accuracy = pipeline_cv.score(X_train, y_train)
train_accuracy

0.5892857142857143

In [28]:
# Score of the fitted grid search on the held-out test split.
test_accuracy = pipeline_cv.score(X_test, y_test)
test_accuracy

0.6083333333333333

### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [29]:
# Predictions from the fitted grid search for both splits.
# `y_pred` (test-set predictions) is reused by the classification report below.
y_pred_train = pipeline_cv.predict(X_train)
y_pred = pipeline_cv.predict(X_test)

# Confusion matrices for the training and test sets (rows = true rank,
# columns = predicted rank). Passing the same label list to both keeps the
# two matrices aligned even if a rank is missing from one split.
rank_labels = list(pipeline_cv.classes_)
train_cm = pd.DataFrame(
    confusion_matrix(y_train, y_pred_train, labels=rank_labels),
    index=rank_labels,
    columns=rank_labels,
).rename_axis(index='true', columns='predicted')
test_cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=rank_labels),
    index=rank_labels,
    columns=rank_labels,
).rename_axis(index='true', columns='predicted')

display(train_cm, test_cm)

In [30]:
# Classification report for the test-set predictions; it is a formatted
# string, so it needs print() to render as a table.
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

           1       0.25      0.50      0.33         4
           2       0.70      0.55      0.61        42
           3       0.65      0.75      0.70        40
           4       0.47      0.57      0.52        14
           5       0.62      0.50      0.56        20

    accuracy                           0.61       120
   macro avg       0.54      0.57      0.54       120
weighted avg       0.63      0.61      0.61       120

