In [1]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score



In [2]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


In [6]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

data_all['text'] = data_all['text'].apply(clean)

data_all.head()

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps.,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English


In [7]:
nltk.download('punkt')
nltk.download('wordnet')

def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data_all['text'] = data_all['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
english = data_all[data_all['language'] == 'English'] 
chinese = data_all[data_all['language'] == 'Chinese'] 
french = data_all[data_all['language'] == 'French'] 
italian = data_all[data_all['language'] == 'Italian'] 
portuguese = data_all[data_all['language'] == 'Portuguese'] 
spanish = data_all[data_all['language'] == 'Spanish'] 

In [10]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

stop_words_english = set(stopwords.words('english'))
stop_words_chinese = set(stopwords.words('chinese'))
stop_words_french = set(stopwords.words('french'))
stop_words_italian = set(stopwords.words('italian'))
stop_words_portuguese = set(stopwords.words('portuguese'))
stop_words_spanish = set(stopwords.words('spanish'))


# Function to remove stop words for english
def remove_stopwords_english(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_english]
    return ' '.join(filtered_words)

english['text'] = english['text'].apply(remove_stopwords_english)


# Function to remove stop words for english
def remove_stopwords_chinese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_chinese]
    return ' '.join(filtered_words)

chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)


# Function to remove stop words for english
def remove_stopwords_french(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_french]
    return ' '.join(filtered_words)

french['text'] = french['text'].apply(remove_stopwords_french)


# Function to remove stop words for english
def remove_stopwords_italian(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_italian]
    return ' '.join(filtered_words)

italian['text'] = italian['text'].apply(remove_stopwords_italian)


# Function to remove stop words for english
def remove_stopwords_portuguese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_portuguese]
    return ' '.join(filtered_words)

portuguese['text'] = portuguese['text'].apply(remove_stopwords_portuguese)


# Function to remove stop words for english
def remove_stopwords_spanish(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_spanish]
    return ' '.join(filtered_words)

spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)


#data = english.union(chinese).union(french).union(italian).union(portuguese).union(spanish)
data  = pd.concat([english, chinese, french, italian, portuguese, spanish])

print(data.head())
print("Num rows:", len(data))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english['text'] = english['text'].apply(remove_stopwords_english)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyda

                                                text  label language
0  wearing fake engagement ring guy ’ approach li...    1.8  English
1                                   Bees vs. Wasps .    1.0  English
2                        nice equation : 0+0-0-0+0=0    1.0  English
3                              Enjoy new day ! 😊🇨🇦🐞🐭    1.6  English
4  perfectly good day think Star Wars episode 9 ’...    1.6  English
Num rows: 9491


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)


In [11]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

In [12]:
# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)
    
    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)
    
    def calculate_pearson(mod, X_ts, y_ts):
        y_pred_1 = mod.predict(X_ts)
        pearson, _ = pearsonr(y_pred_1, y_ts)
        return pearson
    
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)
    
    mse = mean_squared_error(y_pred, y_ts)
    print("Mean Square Error for", model, "is: " , mse)
      
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)
    
    return pearson_r, pearson_cv, mse, mse_cv

In [14]:
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.26350199681555153
Pearson's r for LinearRegression() after cross validation is:  [0.19877439 0.13508835 0.21303962 0.16588023 0.27195991]
Mean Square Error for LinearRegression() is:  1.005263857846706
Mean Square Error for LinearRegression() after cross validation is:  [0.86838704 0.96796506 0.74087055 1.02163005 0.97667745]


In [15]:
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.3342116504282937
Pearson's r for SVR() after cross validation is:  [0.22433657 0.19372643 0.1997064  0.15200109 0.22838864]
Mean Square Error for SVR() is:  0.7282575852527919
Mean Square Error for SVR() after cross validation is:  [0.76627383 0.81064904 0.68233797 0.85278376 0.84383063]


In [16]:
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.18521762062953437
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.12673603 0.04032416 0.12890361 0.13694164 0.16324245]
Mean Square Error for DecisionTreeRegressor() is:  1.1288593089764314
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.15970518 1.27163819 1.05189108 1.29789935 1.21348063]


In [17]:
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.3476767409894662
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.21948103 0.2147082  0.18627174 0.18994466 0.2455353 ]
Mean Square Error for Ridge(alpha=0.9) is:  0.7138640514572226
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.76121111 0.78841457 0.69693663 0.82415719 0.81258364]


In [18]:
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.29491105561884706
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.14338094 0.11686084 0.18885253 0.13635361 0.25542101]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.8080046952820029
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.91476511 1.00896749 0.74084973 1.0436324  1.0118512 ]
