In [83]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score

#### Data read-in

In [84]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Filtering for english

In [85]:
data = data_all[data_all['language'] == 'English'] 
data

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English
...,...,...,...
1582,PSA!!! Even though I’m busy 99.99999% of the t...,2.2,English
1583,@user @OtterBox Isnt that the only reason we b...,1.6,English
1584,#NetajiSubhasChandraBose The ART The ARTIST http,1.0,English
1585,Nothing compares with being with someone who a...,3.6,English


#### Cleaning the data

In [86]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

#data['text'] = data['text'].apply(clean)

#data.head()

#### Tokenizing and Lemmatization

In [87]:
nltk.download('punkt')
nltk.download('wordnet')


def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(tokenize_and_lemmatize)


#### Removing Stopwords

In [88]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


#data['text'] = data['text'].apply(remove_stopwords)


print(data.head())

                                                text  label language
0  wearing a fake engagement ring so guy won ’ t ...    1.8  English
1                              Bees vs. Wasps . http    1.0  English
2              Here is a nice equation : 0+0-0-0+0=0    1.0  English
3           @ user @ user Enjoy each new day ! 😊🇨🇦🐞🐭    1.6  English
4  I can be having a perfectly good day then I th...    1.6  English


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Feature extraction using TF-IDF

In [89]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

#### Train and Test Split

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Creating a function to predict and evaluate the prediction

In [91]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)
    
    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)
    
    def calculate_pearson(mod, X_ts, y_ts):
        y_pred_1 = mod.predict(X_ts)
        pearson, _ = pearsonr(y_pred_1, y_ts)
        return pearson
    
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)
    
    mse = mean_squared_error(y_pred, y_ts)
    print("Mean Square Error for", model, "is: " , mse)
      
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)
    
    return pearson_r, mse, mse_cv

#### Linear Regression

In [92]:
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.48748073395660296
Pearson's r for LinearRegression() after cross validation is:  [0.40540163 0.36372856 0.53897348 0.48927833 0.38317135]
Mean Square Error for LinearRegression() is:  0.6537288560185293
Mean Square Error for LinearRegression() after cross validation is:  [0.45532563 0.51262575 0.68307686 0.6286552  0.85871471]


#### Service Vector Regression

In [93]:
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.5589728615357136
Pearson's r for SVR() after cross validation is:  [0.34838581 0.35186473 0.51248508 0.43739434 0.45564716]
Mean Square Error for SVR() is:  0.5462805775352324
Mean Square Error for SVR() after cross validation is:  [0.47848233 0.48525128 0.86575966 0.6355661  0.93772615]


#### Decision Tree Regressor

In [94]:
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.33661627975044517
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.09223411 0.23876046 0.26439235 0.34605805 0.4894922 ]
Mean Square Error for DecisionTreeRegressor() is:  0.9564150943396228
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.15681858 0.99013889 1.21187934 1.18948854 0.96849206]


#### Ridge Regression

In [95]:
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.5535602390519943
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.43555899 0.36673971 0.54639142 0.50302281 0.45964583]
Mean Square Error for Ridge(alpha=0.9) is:  0.5249331670192434
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.43306106 0.50136195 0.74043324 0.57785217 0.83841072]


#### Random Forest Regressor

In [96]:
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.4722185276469098
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.29646177 0.31578047 0.40461505 0.48231666 0.59123926]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.605313924970519
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.5360217  0.53534536 0.88692599 0.57838013 0.8676906 ]
