In [1]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score



#### Data read-in

In [2]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Filtering for english

In [3]:
data = data_all[data_all['language'] == 'English'] 
data

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English
...,...,...,...
1582,PSA!!! Even though I’m busy 99.99999% of the t...,2.2,English
1583,@user @OtterBox Isnt that the only reason we b...,1.6,English
1584,#NetajiSubhasChandraBose The ART The ARTIST http,1.0,English
1585,Nothing compares with being with someone who a...,3.6,English


#### Cleaning the data

In [4]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

#data['text'] = data['text'].apply(clean)

#data.head()

#### Tokenizing and Lemmatization

In [5]:
nltk.download('punkt')
nltk.download('wordnet')


def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(tokenize_and_lemmatize)


#### Removing Stopwords

In [6]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


#data['text'] = data['text'].apply(remove_stopwords)


print(data.head())

                                                text  label language
0  wearing a fake engagement ring so guy won ’ t ...    1.8  English
1                              Bees vs. Wasps . http    1.0  English
2              Here is a nice equation : 0+0-0-0+0=0    1.0  English
3           @ user @ user Enjoy each new day ! 😊🇨🇦🐞🐭    1.6  English
4  I can be having a perfectly good day then I th...    1.6  English


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Feature extraction using TF-IDF

In [7]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

In [8]:
print("Vocabulary size of the vectorizer:", len(tfidf_vectorizer.get_feature_names()))

Vocabulary size of the vectorizer: 4464




#### Train and Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Creating a function to predict and evaluate the prediction

In [10]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)
    
    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)
    
    def calculate_pearson(mod, X_ts, y_ts):
        y_pred_1 = mod.predict(X_ts)
        pearson, _ = pearsonr(y_pred_1, y_ts)
        return pearson
    
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)
    
    mse = mean_squared_error(y_pred, y_ts)
    print("Mean Square Error for", model, "is: " , mse)
      
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)
    
    return pearson_r, mse, mse_cv

#### Linear Regression

In [11]:
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.48748073395660296
Pearson's r for LinearRegression() after cross validation is:  [0.40540163 0.36372856 0.53897348 0.48927833 0.38317135]
Mean Square Error for LinearRegression() is:  0.6537288560185293
Mean Square Error for LinearRegression() after cross validation is:  [0.45532563 0.51262575 0.68307686 0.6286552  0.85871471]


#### Support Vector Regression

In [12]:
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.5589728615357136
Pearson's r for SVR() after cross validation is:  [0.34838581 0.35186473 0.51248508 0.43739434 0.45564716]
Mean Square Error for SVR() is:  0.5462805775352324
Mean Square Error for SVR() after cross validation is:  [0.47848233 0.48525128 0.86575966 0.6355661  0.93772615]


#### Decision Tree Regressor

In [13]:
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.28583826121790734
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.05436911 0.2072011  0.30438374 0.32836135 0.51283188]
Mean Square Error for DecisionTreeRegressor() is:  1.0129559748427672
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.04541667 0.9231467  1.11252604 1.04025132 0.96048942]


#### Ridge Regression

In [14]:
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.5535602390519943
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.43555899 0.36673971 0.54639142 0.50302281 0.45964583]
Mean Square Error for Ridge(alpha=0.9) is:  0.5249331670192434
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.43306106 0.50136195 0.74043324 0.57785217 0.83841072]


#### Random Forest Regressor

In [15]:
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.4722185276469098
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.29646177 0.31578047 0.40461505 0.48231666 0.59123926]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.605313924970519
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.5360217  0.53534536 0.88692599 0.57838013 0.8676906 ]


# Evaluation on the test data

In [16]:
test_data = pd.read_csv('test.csv')
print(test_data)

                                                   text  label    language
0                                             @user 아..    3.0      Korean
1                                @user @user je rêve ??    2.2      French
2                                          thank u, nxt    1.0     English
3                            @user ma che cosa HO FATTO    2.6     Italian
4      在教室打飞机，站累了，就搬凳子坐下，站着坐着都是你爸爸，听爸爸的话哦！骚货，爸爸爱你。 http    3.0     Chinese
...                                                 ...    ...         ...
3876  @user Não sei se ele vai terminar com a Kyra (...    3.0  Portuguese
3877  @user Coitada...fraquinha....Povo precisa sabe...    1.4  Portuguese
3878  @user أي عطر حبيبي .... مساعدة بشار البعثي عطر ؟؟    3.5      Arabic
3879  TSC promoveert naar tweede klasse, ontgoocheli...    1.0       Dutch
3880  @user if you pull up to emmas house bald tomor...    1.8     English

[3881 rows x 3 columns]


In [17]:
# Filter for english
test = test_data[test_data['language'] == 'English']

# Data Cleaning 
#test['text'] = test['text'].apply(clean)

# Tokenizing and Lemmatization
test['text'] = test['text'].apply(tokenize_and_lemmatize)

# Removing Stopwords
#test['text'] = test['text'].apply(remove_stopwords)
print(test)

# Feature extraction using TF-IDF
test_X = tfidf_vectorizer.transform(test['text'])
test_y = test['label']

                                                   text  label language
2                                         thank u , nxt    1.0  English
19                          @ user It already got weird    2.4  English
22                             fuck going outside today    1.6  English
37    still pinching myself that i acc live in Amste...    2.4  English
65                  @ user SKY scored 4 le run just lol    1.2  English
...                                                 ...    ...      ...
3843  Out of body , that ’ s just how I feel when I ...    2.5  English
3846  @ user how come you got my new mobile number ,...    2.4  English
3862  @ user @ MarkDice @ AOC Lol . When did that ha...    1.0  English
3868                         @ justinbieber YUMMY SLAPS    1.8  English
3880  @ user if you pull up to emmas house bald tomo...    1.8  English

[396 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['text'] = test['text'].apply(tokenize_and_lemmatize)


#### Linear Regression

In [18]:
lr_test = models(linear_model.LinearRegression(), X_train, y_train, test_X, test_y)

Pearson's r for LinearRegression() is:  0.3062189669362372
Pearson's r for LinearRegression() after cross validation is:  [0.37781721 0.32433793 0.28183064 0.2001658  0.28739253]
Mean Square Error for LinearRegression() is:  1.010727566823603
Mean Square Error for LinearRegression() after cross validation is:  [0.78544549 0.78599449 0.77769023 0.98159523 0.89147194]


#### Support Vector Regression

In [19]:
svr_test = models(svm.SVR(), X_train, y_train, test_X, test_y)

Pearson's r for SVR() is:  0.43673125579287914
Pearson's r for SVR() after cross validation is:  [0.25546581 0.4738948  0.46406376 0.38551605 0.39669476]
Mean Square Error for SVR() is:  0.6105717864288448
Mean Square Error for SVR() after cross validation is:  [0.65694103 0.60278083 0.64805818 0.75831105 0.59530396]


#### Decision Tree Regressor

In [20]:
dt_test = models(tree.DecisionTreeRegressor(), X_train, y_train, test_X, test_y)

Pearson's r for DecisionTreeRegressor() is:  0.29131083940488606
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.18291013 0.36165173 0.01496095 0.16486953 0.33022714]
Mean Square Error for DecisionTreeRegressor() is:  1.0958354377104378
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.25022194 0.81928622 1.23499473 1.13863748 1.07702532]


#### Ridge Regressor

In [21]:
ridge_test = models(linear_model.Ridge(alpha=0.9), X_train, y_train, test_X, test_y)

Pearson's r for Ridge(alpha=0.9) is:  0.4534318784761866
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.31912354 0.46207198 0.3964309  0.37381569 0.42418957]
Mean Square Error for Ridge(alpha=0.9) is:  0.5979681713891573
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.6442856  0.57224037 0.63172964 0.73815057 0.57786503]


#### Random Forest Regressor

In [22]:
rf_test = models(RandomForestRegressor(random_state=42), X_train, y_train, test_X, test_y)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.3963176394600351
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.25154021 0.39087829 0.20036001 0.39847038 0.38134706]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.677883246462016
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.8251734  0.65278848 0.83978557 0.74304164 0.67704338]
