In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the county-level 2016 election dataset; the file is semicolon-delimited.
df = pd.read_csv(
    '../input/us-elections-dataset/usa-2016-presidential-election-by-county.csv',
    sep=';',
)

In [None]:
# Preview the first rows only instead of dumping the whole frame into the output.
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Nothing has changed since the earlier preview; show a short head rather than the full frame.
df.head()

In [None]:
# Total votes cast, summed over every row of the dataset.
df['Votes'].sum()

In [None]:
# Number of distinct values in the 'County' column.
# NOTE(review): if county names are not qualified by state, same-named counties
# in different states are counted once — confirm against the data.
df['County'].nunique()

In [None]:
# Column sums for the two 2016 party columns.
# NOTE(review): if these columns hold vote shares rather than raw counts, this sum
# is not a vote total — confirm the units.
df.loc[:, ['Democrats 2016', 'Republicans 2016']].sum()

In [None]:
# Top five counties by total votes.
# Grouping on the numeric 'Votes' column and calling value_counts() only counted how often
# each (Votes, County) pair occurs (almost always once), which ranks nothing.
df.groupby('County')['Votes'].sum().sort_values(ascending=False).head()

In [None]:
# Top five states by total votes.
# Grouping on the numeric 'Votes' column and calling value_counts() only counted how often
# each (Votes, State) pair occurs (almost always once), which ranks nothing.
df.groupby('State')['Votes'].sum().sort_values(ascending=False).head()

In [None]:
# Max of 'Republicans 08 (Votes)' within each (Votes, State) group, sorted descending, top five.
# NOTE(review): 'Votes' is a numeric count, so most groups presumably hold a single row;
# confirm whether a per-State aggregate was intended instead.
df.groupby(['Votes','State'])['Republicans 08 (Votes)'].max().sort_values(ascending = False).head()

In [None]:
# Min of 'Democrats 08 (Votes)' within each (Votes, State) group, sorted descending, top five
# (i.e. the five largest group minima, not the smallest values).
# NOTE(review): 'Votes' is a numeric count, so most groups presumably hold a single row;
# confirm whether a per-State aggregate was intended instead.
df.groupby(['Votes','State'])['Democrats 08 (Votes)'].min().sort_values(ascending = False).head()

In [None]:
# Histogram of the 'Democrats 2016' column (20 bins, no KDE overlay).
# `sns.distplot` is deprecated and removed in recent seaborn releases; `histplot` is the
# replacement and draws no KDE unless asked to.
sns.histplot(df['Democrats 2016'].dropna(), bins=20)

In [None]:
import cufflinks as cf

In [None]:
# Put cufflinks into offline mode before any `.iplot()` call below.
cf.go_offline()

In [None]:
# Interactive bar chart of the non-missing 'County' values.
# NOTE(review): 'County' looks like a label column, so a bar chart of its raw values may
# not be meaningful — confirm what this plot was meant to show.
county_values = df['County'].dropna()
county_values.iplot(kind='bar')

In [None]:
# Heatmap of the missing-value mask of the raw frame (rows = records, columns = features).
sns.heatmap(
    data=df.isna(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

## Cleaning of Data

In [None]:
# Missing-value count per column, columns with the most gaps first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Number of columns that contain at least one missing value.
# The previous `.sum().count()` only counted the entries of the per-column summary,
# i.e. the total number of columns, which `df.shape` already reports.
df.isna().any().sum()

In [None]:
# (rows, columns) before any cleaning step is applied.
df.shape

In [None]:
# Share of missing values per column, expressed as a percentage of the row count.
percent_missing = 100 * df.isna().sum() / len(df)

In [None]:
# Tabulate each column name next to its missing-value percentage.
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)

In [None]:
# Order the table so the sparsest columns come first.
missing_value_df = missing_value_df.sort_values(by='percent_missing', ascending=False)

In [None]:
# Show the per-column missing-value percentages, highest first.
missing_value_df

In [None]:
# Keep only columns with at least 70% non-missing values; sparser columns are dropped.
# `thresh` is documented as an integer count of required non-NA values, so round up
# (for integer counts, `count >= ceil(x)` is the same test as `count >= x`).
min_non_null = int(np.ceil(0.7 * len(df)))
df = df.dropna(axis=1, thresh=min_non_null)

In [None]:
# Largest per-column missing-value count remaining after the sparse columns were dropped.
df.isna().sum().max()

In [None]:
# Missing-value mask again, now that the sparsest columns have been removed.
sns.heatmap(
    data=df.isna(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Remaining missing-value counts per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Forward-fill the remaining gaps from the previous row.
# `fillna(method='ffill')` is deprecated since pandas 2.1; `DataFrame.ffill()` is the
# supported equivalent.
# NOTE(review): values missing in the very first row(s) stay NaN, and filling a record from
# the row above assumes neighbouring rows are comparable — confirm this is intended.
df = df.ffill()

In [None]:
# Missing-value mask after forward-filling; any remaining marks are gaps the fill could not reach.
sns.heatmap(
    data=df.isna(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Preview the cleaned frame; a short head is enough to check the result.
df.head()

### Feature selection using pairwise correlation (a correlation filter, not Recursive Feature Elimination)

In [None]:
# Collect every column whose absolute Pearson correlation with at least one EARLIER column
# (earlier in column order) exceeds 0.8.
#
# Only numeric/bool columns are correlated: since pandas 2.0 `DataFrame.corr()` raises on
# string columns instead of silently skipping them.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Boolean mask of strong correlations; NaN entries compare as False.
strongly_correlated = correlation_matrix.abs().gt(0.8).to_numpy()
# Keep the strictly-lower triangle so each pair is considered once (j < i) and the diagonal
# (self-correlation of 1.0) is ignored.
lower_triangle = np.tril(strongly_correlated, k=-1)

correlated_features = set(correlation_matrix.columns[lower_triangle.any(axis=1)])

In [None]:
# Columns flagged as highly correlated (|r| > 0.8) with some earlier column.
correlated_features

In [None]:
# Narrow the frame to the hand-picked modelling columns plus the two 2016 target columns.
# NOTE(review): this list appears to mirror the highly-correlated set found above; keeping
# (rather than dropping) mutually correlated columns should be confirmed as intentional.
df = df.loc[:, [
    'Asian',
    'At Least High School Diploma',
    'Black',
    'Child.Poverty.living.in.families.below.the.poverty.line',
    'Democrats 08 (Votes)',
    'Democrats 12 (Votes)',
    'Democrats 2008',
    'Democrats 2012',
    'Graduate Degree',
    'Nearest County',
    'Poverty.Rate.below.federal.poverty.threshold',
    'Republicans 08 (Votes)',
    'Republicans 12 (Votes)',
    'Republicans 2008',
    'Republicans 2012',
    'Total Population',
    'Votes',
    'White',
    'White  Asian',
    'total08',
    'total12',
    'total16',
    'Democrats 2016',
    'Republicans 2016',
]]

In [None]:
# Discard any row that still has a missing value in one of the selected columns.
df = df.dropna(axis=0, how='any')

In [None]:
# First rows of the modelling frame after column selection and row-wise NaN removal.
df.head()

In [None]:
# Predictor matrix: everything except the two 2016 target columns.
features = df.drop(['Democrats 2016', 'Republicans 2016'], axis=1)

In [None]:
# Two-column target frame: the 2016 Democrat and Republican values.
target = df.loc[:, ['Democrats 2016', 'Republicans 2016']]

In [None]:
# Shapes of the target and feature frames, one per line.
print(target.shape, features.shape, sep='\n')

### Training the model

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the four split pieces, one per line: X_test, X_train, y_test, y_train.
print(X_test.shape, X_train.shape, y_test.shape, y_train.shape, sep='\n')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, so no test-set statistics enter it.
scaler = MinMaxScaler()
# Rebuild a DataFrame from the scaled array. Keep X_train's index: the default RangeIndex
# would no longer line up with y_train's row labels in any index-aligned pandas operation.
normalised_X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Preview the scaled training features.
normalised_X_train.head()

In [None]:
# Scale the test split with the scaler already fitted on the training split (transform only).
# Keep X_test's index: the default RangeIndex would no longer line up with y_test's row
# labels in any index-aligned pandas operation.
normalised_X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Preview the scaled test features.
normalised_X_test.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [None]:
# Depth cap shared by both forests.
max_depth = 30

# Model 1: a random forest wrapped in MultiOutputRegressor, fitted on both target columns.
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
)
regr_multirf.fit(normalised_X_train, y_train)

# Model 2: a single random forest fitted directly on the two-column target.
# Note that it uses a different seed (2) than the wrapped forest (0).
regr_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=max_depth,
    random_state=2,
)
regr_rf.fit(normalised_X_train, y_train)

In [None]:
# Predict the scaled test features with both fitted models (wrapped forest first, then the
# single forest).
y_multirf, y_rf = (
    model.predict(normalised_X_test) for model in (regr_multirf, regr_rf)
)

In [None]:
# Predictions from the MultiOutputRegressor model, to compare against the true values in the next cell.
y_multirf # comparing the predicted results

In [None]:
# True target values for the first test rows.
y_test.head() # Original result

In [None]:
from sklearn import metrics

In [None]:
# Mean absolute error of the MultiOutputRegressor predictions on the test split
# (scikit-learn averages the error uniformly over the two target columns by default).
MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_multirf)
# Average absolute gap between predicted and true values, in the targets' own units.
round(MAE, 2)

In [None]:
# Coefficient of determination (R²) of the MultiOutputRegressor predictions on the test
# split, averaged uniformly over the two target columns (scikit-learn's default).
r2 = metrics.r2_score(y_true=y_test, y_pred=y_multirf)
# Values close to 1 mean the predictions explain most of the variance in the test targets.
round(r2, 2)

In [None]:
# Root mean squared error of the MultiOutputRegressor predictions on the test split.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_multirf)
RMSE = np.sqrt(mse)
RMSE


Overall, the three metrics above (MAE, R² and RMSE) all indicate that the model fits the held-out test data well.

In [None]:
# Summary statistics of the true test targets, for judging the scale of the error metrics above.
y_test.describe()

In [None]:
# Wrap the predictions in a DataFrame that carries y_test's column names and row index.
# Without them the columns are just 0 and 1, and rows cannot be matched back to y_test.
# `np.asarray` keeps this cell safe to re-run once y_multirf is already a DataFrame.
y_multirf = pd.DataFrame(
    np.asarray(y_multirf),
    columns=y_test.columns,
    index=y_test.index,
)

In [None]:
# Column sums of the predictions over the test rows (column order follows y_test).
# The original author read this result as the Republicans coming out ahead.
# NOTE(review): if the targets are per-county vote shares, an unweighted sum ignores county
# size and does not by itself establish a winner — confirm the units.
y_multirf.sum() # Republicans emerged winners according to this description

In [None]:
# Joint distribution of the 'Democrats 2016' column against total votes.
sns.jointplot(x='Democrats 2016', y='Votes', data=df)

In [None]:
# Joint distribution of the 'Republicans 2016' column against total votes.
sns.jointplot(x='Republicans 2016', y='Votes', data=df)