In [61]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score

#### Data read-in

In [62]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Cleaning the data

In [63]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

#data_all['text'] = data_all['text'].apply(clean)

#data_all.head()

#### Tokenizing and Lemmatization

In [64]:
nltk.download('punkt')
nltk.download('wordnet')

def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data_all['text'] = data_all['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Creating datasets by language

In [65]:
english = data_all[data_all['language'] == 'English'] 
chinese = data_all[data_all['language'] == 'Chinese'] 
french = data_all[data_all['language'] == 'French'] 
italian = data_all[data_all['language'] == 'Italian'] 
portuguese = data_all[data_all['language'] == 'Portuguese'] 
spanish = data_all[data_all['language'] == 'Spanish'] 

#### Stopwords removal

In [66]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

stop_words_english = set(stopwords.words('english'))
stop_words_chinese = set(stopwords.words('chinese'))
stop_words_french = set(stopwords.words('french'))
stop_words_italian = set(stopwords.words('italian'))
stop_words_portuguese = set(stopwords.words('portuguese'))
stop_words_spanish = set(stopwords.words('spanish'))


# Function to remove stop words for english
def remove_stopwords_english(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_english]
    return ' '.join(filtered_words)

#english['text'] = english['text'].apply(remove_stopwords_english)


# Function to remove stop words for english
def remove_stopwords_chinese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_chinese]
    return ' '.join(filtered_words)

#chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)


# Function to remove stop words for english
def remove_stopwords_french(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_french]
    return ' '.join(filtered_words)

#french['text'] = french['text'].apply(remove_stopwords_french)


# Function to remove stop words for english
def remove_stopwords_italian(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_italian]
    return ' '.join(filtered_words)

#italian['text'] = italian['text'].apply(remove_stopwords_italian)


# Function to remove stop words for english
def remove_stopwords_portuguese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_portuguese]
    return ' '.join(filtered_words)

#portuguese['text'] = portuguese['text'].apply(remove_stopwords_portuguese)


# Function to remove stop words for english
def remove_stopwords_spanish(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_spanish]
    return ' '.join(filtered_words)

#spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)



#data  = pd.concat([english, chinese, french, italian, portuguese, spanish])
data = data_all

print(data.head())
print("Num rows:", len(data))


                                                text  label language
0  wearing a fake engagement ring so guy won ’ t ...    1.8  English
1                              Bees vs. Wasps . http    1.0  English
2              Here is a nice equation : 0+0-0-0+0=0    1.0  English
3           @ user @ user Enjoy each new day ! 😊🇨🇦🐞🐭    1.6  English
4  I can be having a perfectly good day then I th...    1.6  English
Num rows: 9491


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Feature extraction using TF-IDF

In [67]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

#### Tran and Test Split

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Creating a function to predict and evaluate the prediction

In [69]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)
    
    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)
    
    def calculate_pearson(mod, X_ts, y_ts):
        y_pred_1 = mod.predict(X_ts)
        pearson, _ = pearsonr(y_pred_1, y_ts)
        return pearson
    
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)
    
    mse = mean_squared_error(y_pred, y_ts)
    print("Mean Square Error for", model, "is: " , mse)
      
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)
    
    return pearson_r, pearson_cv, mse, mse_cv

#### Linear Regression

In [70]:
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.23089700589113848
Pearson's r for LinearRegression() after cross validation is:  [0.22241299 0.29895702 0.25895541 0.27400264 0.31028882]
Mean Square Error for LinearRegression() is:  1.077527942861177
Mean Square Error for LinearRegression() after cross validation is:  [0.72403087 0.81785748 0.71789167 0.73061872 0.81930056]


#### Service Vector Regression

In [71]:
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.4127894693459429
Pearson's r for SVR() after cross validation is:  [0.30831422 0.30930053 0.29001737 0.34050096 0.3746493 ]
Mean Square Error for SVR() is:  0.6670610736297159
Mean Square Error for SVR() after cross validation is:  [0.6592582  0.79862815 0.68062319 0.69242026 0.8211113 ]


#### Decision Tree Regressor

In [72]:
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.21840287370360903
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.20424259 0.21834384 0.18292873 0.24837325 0.23768277]
Mean Square Error for DecisionTreeRegressor() is:  1.1645770473208241
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.2146531  1.46867333 1.23176543 1.25405482 1.36521841]


#### Ridge Regression

In [73]:
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.41045168257446313
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.28939259 0.31244344 0.29504246 0.34804303 0.36689302]
Mean Square Error for Ridge(alpha=0.9) is:  0.6613361135725191
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.66303318 0.7829483  0.66233566 0.67191569 0.78456122]


#### Random Forest Regressor

In [74]:
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.3438763305236285
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.24358303 0.21107175 0.26506125 0.33971407 0.33833686]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.7570275702431948
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.81503506 1.01898314 0.83333417 0.77170308 0.94353945]
