## Did the impeachment skew the polls significantly?
## Was it predictable?

Let's review the timeline of events before we start:
 
- September-November : Inquiry
- December 18, 2019 : Impeached by US HoR
- January 16, 2020 : Passed to the Senate (trial begins)
- February 5, 2020 : Acquitted

In [1]:
import pandas as pd
# Load the raw primary-poll table. low_memory=False makes pandas infer each
# column's dtype from the whole file in one pass, which avoids the mixed-type
# DtypeWarning that chunked inference emits for this CSV.
primary_df = pd.read_csv("president_primary_polls.csv", low_memory=False)
# Only the last expression of a cell is displayed, so show the schema here
# (a bare `primary_df.head()` above this line would be silently discarded).
primary_df.dtypes

  interactivity=interactivity, compiler=compiler, result=result)


question_id               int64
poll_id                   int64
cycle                     int64
state                    object
pollster_id               int64
pollster                 object
sponsor_ids              object
sponsors                 object
display_name             object
pollster_rating_id      float64
pollster_rating_name     object
fte_grade                object
sample_size             float64
population               object
population_full          object
methodology              object
office_type              object
start_date               object
end_date                 object
sponsor_candidate        object
internal                   bool
partisan                 object
tracking                   bool
nationwide_batch           bool
created_at               object
notes                    object
url                      object
stage                    object
party                    object
answer                   object
candidate_id              int64
candidat

Split the polls into training and testing sets

In [2]:
# Positional hold-out split: the first SPLIT_ROW rows train, the rest test.
# `.loc[:13321]` / `.loc[13320:]` are label-inclusive on both ends, so rows
# 13320 and 13321 used to land in BOTH sets. A single iloc cut point keeps the
# two sets disjoint (primary_df carries the default 0..n-1 index from read_csv,
# so positions and labels coincide and the test set still starts at row 13320).
SPLIT_ROW = 13320
primary_train = primary_df.iloc[:SPLIT_ROW].copy()
primary_test = primary_df.iloc[SPLIT_ROW:].copy()

# Replace missing text with empty strings. Assign the result back instead of
# calling fillna(inplace=True) on a selected column: that is chained assignment,
# which newer pandas warns about and which does nothing under copy-on-write.
primary_train['candidate_name'] = primary_train['candidate_name'].fillna('')
primary_train['state'] = primary_train['state'].fillna('')

In [3]:
# Training feature frame: an independent one-column copy holding only the
# candidate-name text.
X_Train = primary_train.loc[:, ["candidate_name"]].copy(deep=True)
X_Train.head(3)

Unnamed: 0,candidate_name
0,Bernard Sanders
1,Elizabeth Warren
2,Joseph R. Biden Jr.


In [4]:
# Test feature frame: an independent one-column copy holding only the
# candidate-name text.
X_Test = primary_test.loc[:, ["candidate_name"]].copy(deep=True)
X_Test.head(3)

Unnamed: 0,candidate_name
13320,Jay Robert Inslee
13321,Tim Ryan
13322,Tulsi Gabbard


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each training row's 'candidate_name' string into a TF-IDF vector.
# Tokens are lower-cased, English stop words are dropped, and the vocabulary
# is capped at the 100 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    use_idf=True,
    max_features=100,
    stop_words='english',
)

# Learn the vocabulary / IDF weights from the training names and encode them.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(X_Train["candidate_name"])

# Dense frame with one column per vocabulary term.
# - get_feature_names_out() replaces get_feature_names(), which was removed
#   in scikit-learn 1.2.
# - toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
# - Reusing X_Train's index keeps rows aligned when the frames are later
#   concatenated column-wise.
df_train_tfidf = pd.DataFrame(
    tfidf_vectorizer_vectors.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=X_Train.index,
)

df_train_tfidf.head()

Unnamed: 0,amy,andrew,bennet,bernard,beto,biden,blasio,bloomberg,bloxham,booker,...,trump,tulsi,walsh,warren,wayne,weld,wells,william,williamson,yang
0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794442,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Place the raw candidate name beside its TF-IDF columns; rows are matched on
# the shared index.
df_train = pd.concat((X_Train, df_train_tfidf), axis="columns")
df_train.head(2)

Unnamed: 0,candidate_name,amy,andrew,bennet,bernard,beto,biden,blasio,bloomberg,bloxham,...,trump,tulsi,walsh,warren,wayne,weld,wells,william,williamson,yang
0,Bernard Sanders,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Elizabeth Warren,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# TF-IDF over every poll row's candidate name: unigrams and bigrams, terms
# seen in fewer than 5 rows are dropped, term frequency is log-scaled and each
# row vector is L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
candidate_names = primary_df["candidate_name"]
features = tfidf.fit_transform(candidate_names).toarray()
labels = primary_df["state"]
# (rows, vocabulary size)
features.shape

(17762, 157)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


# Fill missing text by assigning the result back. fillna(inplace=True) on a
# selected column is chained assignment: newer pandas warns about it and it
# has no effect once copy-on-write is enabled.
primary_df['candidate_name'] = primary_df['candidate_name'].fillna('')
primary_df['state'] = primary_df['state'].fillna('none')

# The state name is the input text and the candidate name is the label to
# predict. A fixed random_state makes the shuffle (and therefore the fitted
# model and its predictions) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    primary_df['state'],
    primary_df['candidate_name'],
    random_state=42,
)

# Bag-of-words counts -> TF-IDF weights. Both fitted objects must be reused,
# in this order, on any text passed to clf.predict.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
tfid_transformer = TfidfTransformer()
x_train_tfidf = tfid_transformer.fit_transform(x_train_counts)

clf = MultinomialNB().fit(x_train_tfidf, y_train)


In [29]:
# clf was fitted on TF-IDF-weighted counts, so a query must pass through the
# same two fitted steps (counts, then TF-IDF) rather than raw counts alone.
query_counts = count_vect.transform(["Iowa"])
query_tfidf = tfid_transformer.transform(query_counts)
print(clf.predict(query_tfidf))

['Joseph R. Biden Jr.']
