In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import PolynomialFeatures

## (A)

In [2]:
# Read the cleaned Iris table from the working directory and preview its
# first five rows.
df = pd.read_csv(filepath_or_buffer="./Iris-cleaned.csv")
df.head(n=5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Predictors: remove the petal measurements and the class label, leaving the
# two sepal columns.
non_feature_cols = ['petal length', 'petal width', 'species']
features = df.drop(columns=non_feature_cols)
features.head()

Unnamed: 0,sepal length,sepal width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [4]:
# The class label to predict is the species column.
target = df["species"]

In [5]:
# Logistic-regression classifier created with the library's default settings.
logR=LogisticRegression()

In [6]:
# Fit the classifier on the complete data set using the two sepal features.
logR.fit(X=features, y=target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
# Cross-validate the classifier on the raw sepal features, also recording the
# training-fold scores so train and test performance can be compared.
# cv is pinned to 3 folds: the library default changed from 3 to 5 in
# scikit-learn 0.22, and the scores recorded in this notebook are 3-fold.
results = cross_validate(logR, features, target, cv=3, return_train_score=True)

In [8]:
# Display the per-fold timings and train/test scores returned by cross_validate.
results

{'fit_time': array([0.00199938, 0.00099993, 0.00099921]),
 'score_time': array([0.04200578, 0.00099921, 0.00099945]),
 'test_score': array([0.78431373, 0.64705882, 0.8125    ]),
 'train_score': array([0.75757576, 0.74747475, 0.74509804])}

In [9]:
# Average score over the training folds.
np.mean(results["train_score"])

0.7500495147553972

In [10]:
# Average score over the held-out folds.
np.mean(results["test_score"])

0.7479575163398694

#### No evidence of overfitting, but there is evidence of underfitting, as the training scores do not get much higher than the test scores (both average about 0.75)

## (B)

In [11]:
# Expand the sepal features to degree 2 with interaction_only=True, which
# yields a constant column, the original columns and their product
# (no squared terms).
polys = PolynomialFeatures(2, interaction_only=True)
features_engineered = polys.fit_transform(features)
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out; use whichever this installation provides so
# the cell runs on both old and new versions.
if hasattr(polys, "get_feature_names_out"):
    cols = polys.get_feature_names_out(features.columns)
else:
    cols = polys.get_feature_names(features.columns)
# Wrap the transformed array back into a labelled DataFrame.
features_engineered = pd.DataFrame(features_engineered, columns=cols)
features_engineered.head()

Unnamed: 0,1,sepal length,sepal width,sepal length sepal width
0,1.0,5.1,3.5,17.85
1,1.0,4.9,3.0,14.7
2,1.0,4.7,3.2,15.04
3,1.0,4.6,3.1,14.26
4,1.0,5.0,3.6,18.0


In [12]:
# Flag columns whose standard deviation is exactly zero (constant columns),
# remove them, and report the resulting shape.
ix = features_engineered.std().eq(0)
drop_cols = features_engineered.columns[ix]
features_engineered = features_engineered.drop(columns=drop_cols)
print(features_engineered.shape)

(150, 3)


In [13]:
# Standardise every column: subtract its mean, then divide by its standard
# deviation (both computed from the frame before it is reassigned).
features_engineered = (
    features_engineered
    .sub(features_engineered.mean(), axis="columns")
    .div(features_engineered.std(), axis="columns")
)

In [14]:
# Repeat the cross-validation on the standardised, engineered features.
# cv is pinned to 3 folds so the evaluation matches the 3-fold scores reported
# in part (A); the library default changed from 3 to 5 in scikit-learn 0.22.
results = cross_validate(logR, features_engineered, target, cv=3, return_train_score=True)
# NOTE: despite the R2_ prefix, these are mean classification accuracies --
# with no `scoring` argument cross_validate uses the estimator's own score,
# which for a classifier is accuracy rather than R^2.
R2_train = results['train_score'].mean()
R2_test = results['test_score'].mean()

In [15]:
# Mean training-fold score, rounded to three decimals. The value is a
# classification accuracy even though the variable is named R2_train.
R2_train.round(3)

0.797

In [16]:
# Mean held-out-fold score, rounded to three decimals. The value is a
# classification accuracy even though the variable is named R2_test.
R2_test.round(3)

0.799

In [17]:
# NOTE(review): cross_validate fits clones of logR, so logR itself was last
# fitted in cell In [6] on the two raw sepal features. The 3x2 array shown
# here therefore belongs to that fit, not to the three engineered features
# evaluated in part (B). Refit logR on features_engineered first if the
# part (B) coefficients are what is wanted.
logR.coef_

array([[-2.49579289,  4.01011301],
       [ 0.49709451, -1.63380222],
       [ 1.15921404, -1.77736568]])

In [18]:
# NOTE(review): as with coef_, these intercepts come from the fit in cell
# In [6] on the raw sepal features; cross_validate does not refit logR itself.
logR.intercept_

array([ 0.81713932,  1.22543562, -2.22516119])