# Using Examples from Class for most of this

In [153]:
# Widget to manipulate plots in Jupyter notebooks
%matplotlib widget

import matplotlib.pyplot as plt # For general plotting
from matplotlib import cm
import pandas as pd

from math import ceil, floor

import numpy as np
import seaborn as sns
from scipy.stats import norm, multivariate_normal

np.set_printoptions(suppress=True)

plt.rc('font', size=22)          # controls default text sizes
plt.rc('axes', titlesize=18)     # fontsize of the axes title
plt.rc('axes', labelsize=18)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)    # fontsize of the tick labels
plt.rc('ytick', labelsize=14)    # fontsize of the tick labels
plt.rc('legend', fontsize=16)    # legend fontsize
plt.rc('figure', titlesize=22)   # fontsize of the figure title

# Import dataset

In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dataset

demographics_df = pd.read_csv("demographics.csv")
features = [
    "pop",
    "hispanic_or_latino",
    "white",
    "black",
    "native_american",
    "asian",
    "nhpi",
    "other",
    "two_or_more",
    "sex_ratio",
    "bachelors",
    "income",
    "foreign_born",
    "age",
]

X = demographics_df[features]
y = demographics_df["margin"]

X['pop'] = X['pop'].str.replace(',','')
X['income'] = X['income'].str.replace(',','')

# Tread un-reported income as 0, can choose mean for state or something instead later?
X['income'] = X['income'].str.replace('-', '0.0')

X_standardized = StandardScaler().fit_transform(X)

y_binary = [0 if l < 0 else 1 for l in y]
print(y_binary)

X_train, X_test, y_train, y_test = train_test_split(X_standardized, y_binary, test_size=0.15, random_state=0)

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['pop'] = X['pop'].str.replace(',','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['income'] = X['income'].str.replace(',','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['income'] = X['income'].str.replace('-', '0.0')


In [155]:
# Some stats about the dataset (out of curiosity)
print("N samples: {}".format(len(X)))
print("N Training Samples: {}".format(len(X_train)))
print("N Test Samples: {}".format(len(X_test)))


N samples: 3152
N Training Samples: 2679
N Test Samples: 473


In [156]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV()

lr.fit(X_train, y_train)
y_preds_test = lr.predict(X_test)

score = lr.score(X_test, y_test)
print(score)

0.9006342494714588


In [157]:
# Prep for continuous regression to predict margin
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.15, random_state=0)

# SKLearn Linear Regression

In [158]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)

linreg.score(X_test, y_test)
linpreds = linreg.predict(X_test)
# print(linpreds)

binary_lin_preds = []

for pred in linpreds:
    if pred < 0:
        binary_lin_preds.append(0)
    else:
        binary_lin_preds.append(1)

In [159]:
# Compute binary classification error using margin predictions
#  ie even if margin is incorrect, is the binary choice (trump v biden) correct

err_sum = 0
for i in range(len(binary_lin_preds)):
    if binary_lin_preds[i] != y_preds_test[i]:
        err_sum += 1

err = err_sum / len(binary_lin_preds)
print(err)

0.03382663847780127
