# HW 4 : Analysing the sentiment of open-ended survey responses about immigration

In [5]:
import pandas as pd
import numpy as np

tt = pd.read_csv('immSurvey.csv')
tt.head()

alphas = tt.stanMeansNewSysPooled
sample = tt.textToSend

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(sample)
X

pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

#down-weighting frequent words; term frequency–inverse document frequency (TF–IDF), which weights the word counts by a measure of how often they appear in the documents
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X = vec.fit_transform(sample)
pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, alphas,
random_state=1)

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

rbf = ConstantKernel(1.0) * RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=rbf, alpha=1e-8)

gpr.fit(Xtrain.toarray(), ytrain)

# Compute posterior predictive mean and covariance
mu_s, cov_s = gpr.predict(Xtest.toarray(), return_cov=True)

#test correlation between test and mus
np.corrcoef(ytest, mu_s)

array([[1.        , 0.68328523],
       [0.68328523, 1.        ]])

# Extending to include bigrams, frequency of word pairs

## Convert a collection of text documents to a matrix of token counts

In [6]:
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
X = bigram_vectorizer.fit_transform(sample)
print(bigram_vectorizer.get_feature_names())

['1 3', '1 became', '1 boarders', '1 difficult', '1 immigrants', '1 is', '1 language', '1 not', '11 style', '12 million', '125 and', '18 miles', '1b my', '2 if', '2 illegals', '2 immigrants', '2 process', '2 that', '2 years', '3 high', '3 immigrants', '3 legals', '3 of', '3 they', '3 women', '4 that', '4 too', '5 immigration', '5 of', '5 years', '600 a', '9 11', '95 of', 'a artificially', 'a better', 'a bible', 'a bit', 'a burden', 'a catholic', 'a certain', 'a chance', 'a citizen', 'a comfortable', 'a complet', 'a country', 'a crime', 'a criminal', 'a crutial', 'a daily', 'a day', 'a deal', 'a decent', 'a degree', 'a difference', 'a different', 'a difficult', 'a discount', 'a doctor', 'a dominant', 'a double', 'a drain', 'a economical', 'a fence', 'a fine', 'a flat', 'a fortune', 'a free', 'a giant', 'a good', 'a greater', 'a green', 'a heavy', 'a higher', 'a hold', 'a job', 'a license', 'a little', 'a lot', 'a low', 'a major', 'a melting', 'a minority', 'a month', 'a nation', 'a neol

In [7]:
pd.DataFrame(X.toarray(), columns=bigram_vectorizer.get_feature_names())

Unnamed: 0,1 3,1 became,1 boarders,1 difficult,1 immigrants,1 is,1 language,1 not,11 style,12 million,...,you come,you do,you mean,you think,you want,young from,your poor,your tired,your weak,yrars ago
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split data into test set and training set, I chose 50 : 50 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, alphas, test_size=0.5, random_state=42)

## Rerun the GP to see if the correlation changed between the estimates and the ground-truth.

In [16]:
rbf = ConstantKernel(1.0) * RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=rbf, alpha=1e-8)

gpr.fit(X_train.toarray(), y_train)

mu_s, cov_s = gpr.predict(X_test.toarray(), return_cov=True)

np.corrcoef(y_test, mu_s)

array([[1.        , 0.38015479],
       [0.38015479, 1.        ]])

## There was a considerable change when the test set and the training set are divided equally,  I also tried with the default split values. 75 : 25

In [18]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
...     X, alphas, random_state=42)

In [19]:
gpr.fit(X_train_2.toarray(), ytrain)

mu_s_2, cov_s_2 = gpr.predict(X_test_2.toarray(), return_cov=True)

np.corrcoef(y_test_2, mu_s_2)

array([[1.        , 0.22500947],
       [0.22500947, 1.        ]])

## With a larger training set, I got a much lower correlation than the value I got for the 50:50 split.