## Did the impeachment skew the polls significantly?
## Was it predicatable?

Let's review the timeline of everything before we start
 
- September-November : Inquiry
- December 18, 2020 : Impeached by US HoR
- January 16, 2020 : Passed the Senate
- February 5, 2020 : Acquitted

In [1]:
import pandas as pd
primary_df = pd.read_csv("president_primary_polls.csv")
primary_df.head()
primary_df.dtypes

  interactivity=interactivity, compiler=compiler, result=result)


question_id               int64
poll_id                   int64
cycle                     int64
state                    object
pollster_id               int64
pollster                 object
sponsor_ids              object
sponsors                 object
display_name             object
pollster_rating_id      float64
pollster_rating_name     object
fte_grade                object
sample_size             float64
population               object
population_full          object
methodology              object
office_type              object
start_date               object
end_date                 object
sponsor_candidate        object
internal                   bool
partisan                 object
tracking                   bool
nationwide_batch           bool
created_at               object
notes                    object
url                      object
stage                    object
party                    object
answer                   object
candidate_id              int64
candidat

Create a training and testing model

In [2]:
primary_train = primary_df.loc[:13321].copy()
primary_test = primary_df.loc[13320:].copy()
primary_train['candidate_name'].fillna('', inplace=True)
primary_train['state'].fillna('', inplace=True)

In [3]:
X_Train = primary_train.copy(deep=True)
X_Train = X_Train[["candidate_name"]]
X_Train.head(3)

Unnamed: 0,candidate_name
0,Bernard Sanders
1,Elizabeth Warren
2,Joseph R. Biden Jr.


In [4]:
X_Test = primary_test.copy(deep=True)
X_Test = X_Test[["candidate_name"]]
X_Test.head(3)

Unnamed: 0,candidate_name
13320,Jay Robert Inslee
13321,Tim Ryan
13322,Tulsi Gabbard


In [5]:
from sklearn.linear_model import LinearRegression
# Use TFIDF to find important features from the 'description' column
from sklearn.feature_extraction.text import TfidfVectorizer 
 
# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(lowercase=True,use_idf=True, max_features=100, stop_words='english')
 
# just send in all your docs here
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(X_Train["candidate_name"])

df_train_tfidf = pd.DataFrame(tfidf_vectorizer_vectors.todense(),columns=tfidf_vectorizer.get_feature_names())

df_train_tfidf.head()

Unnamed: 0,amy,andrew,bennet,bernard,beto,biden,blasio,bloomberg,bloxham,booker,...,trump,tulsi,walsh,warren,wayne,weld,wells,william,williamson,yang
0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794442,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df_train = pd.concat([X_Train, df_train_tfidf], axis=1)
df_train.head(2)

Unnamed: 0,candidate_name,amy,andrew,bennet,bernard,beto,biden,blasio,bloomberg,bloxham,...,trump,tulsi,walsh,warren,wayne,weld,wells,william,williamson,yang
0,Bernard Sanders,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Elizabeth Warren,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(primary_df.candidate_name).toarray()
labels = primary_df.state
features.shape

(17762, 157)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


primary_df['candidate_name'].fillna('', inplace=True)
primary_df['state'].fillna('none', inplace=True)

x_train, x_test, y_train, y_test = train_test_split(primary_df['state'], primary_df['candidate_name'])
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
tfid_transformer = TfidfTransformer()
x_train_tfidf = tfid_transformer.fit_transform(x_train_counts)

clf = MultinomialNB().fit(x_train_tfidf, y_train)


In [9]:
print(clf.predict(count_vect.transform(["Iowa"])))

['Elizabeth Warren']


What was Donald Trump's approval rating during his impeachment trial?

In [10]:
approval_df = pd.read_csv("president_approval_polls.csv")
approval_df.head()

Unnamed: 0,question_id,poll_id,cycle,state,politician,pollster_id,pollster,sponsor_ids,sponsors,display_name,...,start_date,end_date,sponsor_candidate,tracking,created_at,notes,url,source,yes,no
0,119194,64608,2020.0,,Donald Trump,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,...,2/25/20,2/27/20,,True,2/28/20 09:27,,http://www.rasmussenreports.com/public_content...,538,47.0,52.0
1,119031,64571,2020.0,,Donald Trump,1189,Morning Consult,,,Morning Consult,...,2/24/20,2/26/20,,,2/26/20 15:33,,https://morningconsult.com/wp-content/uploads/...,538,42.0,53.0
2,119032,64571,2020.0,,Donald Trump,1189,Morning Consult,,,Morning Consult,...,2/24/20,2/26/20,,,2/26/20 15:33,,https://morningconsult.com/wp-content/uploads/...,538,43.0,55.0
3,119075,64583,2020.0,,Donald Trump,399,Rasmussen Reports/Pulse Opinion Research,,,Rasmussen Reports/Pulse Opinion Research,...,2/24/20,2/26/20,,True,2/27/20 09:26,,http://www.rasmussenreports.com/public_content...,538,52.0,47.0
4,119267,64591,2020.0,,Donald Trump,590,Fox News/Beacon Research/Shaw & Co. Research,,,Fox News,...,2/23/20,2/26/20,,,2/28/20 18:00,,https://www.scribd.com/document/449414637/Fox-...,538,47.0,52.0


In [11]:
approval_df.dtypes

question_id               int64
poll_id                   int64
cycle                   float64
state                   float64
politician               object
pollster_id               int64
pollster                 object
sponsor_ids              object
sponsors                 object
display_name             object
pollster_rating_id      float64
pollster_rating_name     object
fte_grade                object
sample_size               int64
population               object
population_full          object
methodology              object
start_date               object
end_date                 object
sponsor_candidate       float64
tracking                 object
created_at               object
notes                    object
url                      object
source                   object
yes                     float64
no                      float64
dtype: object

In [13]:
'''primary_train = primary_df.loc[:13321].copy()
primary_test = primary_df.loc[13320:].copy()
primary_train['candidate_name'].fillna('', inplace=True)
primary_train['state'].fillna('', inplace=True)

X_Train = primary_train.copy(deep=True)
X_Train = X_Train[["candidate_name"]]
X_Train.head(3)

X_Test = primary_test.copy(deep=True)
X_Test = X_Test[["candidate_name"]]
X_Test.head(3)

'''

approval_train = approval_df.loc[:5133].copy()
approval_test = approval_df.loc[5144:].copy()
#approval_train['created_at'].fillna('', inplace=True)





In [16]:
approval_X_Train = approval_train.copy(deep=True)
approval_X_Train = approval_X_Train[["poll_id"]]
approval_X_Train.head()

Unnamed: 0,poll_id
0,64608
1,64571
2,64571
3,64583
4,64591


In [15]:
approval_X_Test = approval_train.copy(deep=True)
approval_X_Test = approval_X_Train[["poll_id"]]
approval_X_Test.head()

Unnamed: 0,created_at
0,2/28/20 09:27
1,2/26/20 15:33
2,2/26/20 15:33
3,2/27/20 09:26
4,2/28/20 18:00
