In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the congressional voting records CSV (house-votes-84) into a DataFrame.
VOTES_CSV = "/kaggle/input/congressional-voting-records-data-set/house-votes-84.data.csv"
df = pd.read_csv(VOTES_CSV)

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics for each column of the raw frame.
df.describe()

In [None]:
# Compare how the two parties voted on the 'education-spending' bill;
# bars are coloured by party ('Class Name').
#
# The raw frame stores votes as strings (assumed 'n' / 'y' / '?', the same
# values the later cleaning step replaces -- confirm against the CSV). Seaborn
# orders string categories by first appearance, so the bar order is pinned
# explicitly; otherwise hard-coded tick labels can end up on the wrong bars.
vote_order = ['n', 'y', '?']
plt.figure()
sns.countplot(x='education-spending', hue='Class Name', data=df,
              palette='RdBu', order=vote_order)
plt.xticks([0, 1, 2], ['No', 'Yes', 'Unknown'])
plt.show()

In [None]:
# Of the satellite and missile bills, which do Democrats support far more
# strongly than Republicans? This cell shows 'anti-satellite-test-ban'.
#
# The raw frame stores votes as strings (assumed 'n' / 'y' / '?', the same
# values the later cleaning step replaces -- confirm against the CSV). Seaborn
# orders string categories by first appearance, so the bar order is pinned
# explicitly; otherwise hard-coded tick labels can end up on the wrong bars.
vote_order = ['n', 'y', '?']
plt.figure()
sns.countplot(x='anti-satellite-test-ban', hue='Class Name', data=df,
              palette='RdBu', order=vote_order)
plt.xticks([0, 1, 2], ['No', 'Yes', 'Unknown'])
plt.show()

In [None]:
# Vote counts per party for the 'mx-missile' bill.
#
# The raw frame stores votes as strings (assumed 'n' / 'y' / '?', the same
# values the later cleaning step replaces -- confirm against the CSV). Seaborn
# orders string categories by first appearance, so the bar order is pinned
# explicitly; otherwise hard-coded tick labels can end up on the wrong bars.
vote_order = ['n', 'y', '?']
plt.figure()
sns.countplot(x='mx-missile', hue='Class Name', data=df,
              palette='RdBu', order=vote_order)
plt.xticks([0, 1, 2], ['No', 'Yes', 'Unknown'])
plt.show()

In [None]:
#importing KNeighbors classifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Encode the votes numerically: 'n' -> 0, 'y' -> 1, and '?' -> NaN so that
# unknown votes are treated as missing values by the imputer later on.
# The raw frame `df` is left untouched; the encoded copy is `df01`.
df01 = df.replace({'n': 0, 'y': 1, '?': np.nan})

# Preview the encoded frame (the last expression in the cell is displayed).
df01.head()

In [None]:
# Missing values: per-column counts first, then the grand total.
missing_per_column = df01.isnull().sum()
print(missing_per_column)
print(" \nTotal Missing values are : ", missing_per_column.sum())

In [None]:
# Build NumPy arrays for the response variable and the features.
# Indexing / dropping on the DataFrame yields a Series / DataFrame; converting
# them to NumPy arrays gives scikit-learn plain array inputs.
y = df01["Class Name"].to_numpy()                 # target (response) variable: the party label
X = df01.drop(columns="Class Name").to_numpy()    # every remaining column is a feature

# Fill each missing vote with the most frequent value of its column.
# NOTE(review): the imputer is fitted on the full data set here, before the
# train/test split, so test rows influence the imputed values. The later
# logistic-regression cells rely on X being NaN-free, so this is kept as is.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = imp.fit_transform(X)

# Steps for a pipeline: a list of (name, estimator) 2-tuples. Every step but
# the last must be a transformer; the last step is the final estimator.
knn = KNeighborsClassifier(n_neighbors=8)   # instantiate the model
steps = [
    ('imputation', imp),
    ('KNeighborsClassifier(n_neighbors=8)', knn),
]

In [None]:
# Assemble the imputation + KNN steps into a single estimator.
pipeline = Pipeline(steps=steps)

# Hold out 30% of the rows for testing. The split is stratified on the party
# label and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


In [None]:
# Fit the pipeline (imputer + KNN) on the training split.
pipeline.fit(X_train, y_train)

# Predict a party label for every row of the held-out test split
# (one prediction per test sample).
y_prediction = pipeline.predict(X_test)

print(f"Prediction: {y_prediction}")

In [None]:
#classification report
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Per-class precision / recall / F1 for the KNN pipeline on the test split.
party_labels = ['democrat', 'republican']
report = classification_report(y_test, y_prediction, target_names=party_labels)
print(report)

# Raw confusion matrix, followed by the overall accuracy.
print(confusion_matrix(y_test, y_prediction))
print(accuracy_score(y_test, y_prediction))

In [None]:
# Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
# plot_confusion_matrix was removed in scikit-learn 1.2. Guard the import so
# this cell still runs on current releases; the name stays available on old ones.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Instantiate the model; max_iter is raised to 1000 for the lbfgs solver.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)


In [None]:
# Train the logistic-regression model on the training split.
logreg.fit(X_train, y_train)

# predict_proba returns one column of probabilities per class, ordered as in
# logreg.classes_. Keep only the second column, i.e. the probability that a
# test sample belongs to logreg.classes_[1].
class_probabilities = logreg.predict_proba(X_test)
y_pred_logR = class_probabilities[:, 1]

In [None]:
# ROC curve for the logistic-regression model.
# With threshold = 0 the model predicts the positive class for every sample
# (TPR = FPR = 1); with threshold = 1 it predicts the negative class for every
# sample (TPR = FPR = 0). Sweeping the threshold between those extremes gives a
# series of (FPR, TPR) points -- the ROC curve.
#
# y_pred_logR holds the probabilities of logreg.classes_[1], so that class must
# be the positive label. y_test contains string labels, so pos_label=True would
# match no sample and leave the true-positive rate undefined.
positive_class = logreg.classes_[1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_logR, pos_label=positive_class)

plt.plot([0, 1], [0, 1], 'k--')   # chance-level diagonal for reference
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression Logistic ROC curve')
plt.show()

In [None]:
from sklearn import metrics
# Confusion matrix of the KNN pipeline on the test split.
# metrics.plot_confusion_matrix was removed in scikit-learn 1.2, so the matrix
# is computed explicitly and drawn with ConfusionMatrixDisplay, which is
# available in both older and current releases.
test_predictions = pipeline.predict(X_test)
cm = metrics.confusion_matrix(y_test, test_predictions, labels=pipeline.classes_)
displ = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=pipeline.classes_,
).plot()
displ.figure_.suptitle("Confusion Matrix")
print(f"Confusion matrix:\n{displ.confusion_matrix}")

plt.show()

In [None]:
# Import  modules
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Probability of the second class (logreg.classes_[1]) for each test sample.
y_pred_logR = logreg.predict_proba(X_test)[:, 1]

# AUC on the held-out test split.
test_auc = roc_auc_score(y_test, y_pred_logR)
print(f"AUC: {test_auc}")

# AUC from 5-fold cross-validation over the full (imputed) data set.
cross_valid_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')

# One AUC value per fold.
print(f"AUC scores computed using 5-fold cross-validation: {cross_valid_auc}")


In [None]:

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 15 values of C, log-spaced from 1e-5 to 1e8.
c_space = np.logspace(-5, 8, num=15)
param_grid = dict(C=c_space)   # the grid is passed as a dictionary

# Exhaustive 5-fold cross-validated search over the grid.
# (RandomizedSearchCV is a cheaper alternative: it samples the grid instead of
# trying every combination, so it cannot beat an exhaustive search of the same grid.)
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
logreg_cv.fit(X_train, y_train)

# Report the best C and its mean cross-validated score.
print(f"Tuned Logistic Regression Parameters: {logreg_cv.best_params_}")
print(f"Best score is {logreg_cv.best_score_}")


In [None]:

# Work in progress: hyperparameter tuning with elastic-net regression

# Regularized regression: ElasticNet
from sklearn.linear_model import ElasticNet 
from sklearn.metrics import mean_squared_error


# ElasticNet is a regressor and needs a numeric target, but y_train / y_test
# hold string party labels. Encode them as 0.0 / 1.0, where 1.0 is the label
# that sorts last (for a binary target this is the same class that
# scikit-learn classifiers list as classes_[1]).
positive_class = np.unique(y_train)[-1]
y_train_num = (y_train == positive_class).astype(float)
y_test_num = (y_test == positive_class).astype(float)

# Hyperparameter grid: 30 evenly spaced l1_ratio values between 0 and 1.
l1_space = np.linspace(0, 1, 30)
param_grid = {'l1_ratio': l1_space}

# Instantiate the ElasticNet regressor: elastic_net
elastic_net = ElasticNet()

# Setup the GridSearchCV object: gm_cv (5-fold cross-validation)
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)

# Fit it to the training data, using the numeric target
gm_cv.fit(X_train, y_train_num)

# Predict on the test set and compute metrics against the numeric target
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test_num)
mse = mean_squared_error(y_test_num, y_pred)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))
