In [10]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score

#### Data read-in

In [11]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Filtering for english

In [12]:
data = data_all[data_all['language'] == 'English'] 
data

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English
...,...,...,...
1582,PSA!!! Even though I’m busy 99.99999% of the t...,2.2,English
1583,@user @OtterBox Isnt that the only reason we b...,1.6,English
1584,#NetajiSubhasChandraBose The ART The ARTIST http,1.0,English
1585,Nothing compares with being with someone who a...,3.6,English


#### Cleaning the data

In [13]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

#data['text'] = data['text'].apply(clean)

#data.head()

#### Tokenizing and Lemmatization

In [14]:
nltk.download('punkt')
nltk.download('wordnet')


def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(tokenize_and_lemmatize)


#### Removing Stopwords

In [15]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

stop_words_english = set(stopwords.words('english'))
stop_words_chinese = set(stopwords.words('chinese'))
stop_words_french = set(stopwords.words('french'))
stop_words_italian = set(stopwords.words('italian'))
stop_words_portuguese = set(stopwords.words('portuguese'))
stop_words_spanish = set(stopwords.words('spanish'))


# Function to remove stop words for english
def remove_stopwords_english(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_english]
    return ' '.join(filtered_words)

#english['text'] = english['text'].apply(remove_stopwords_english)


# Function to remove stop words for english
def remove_stopwords_chinese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_chinese]
    return ' '.join(filtered_words)

#chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)


# Function to remove stop words for english
def remove_stopwords_french(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_french]
    return ' '.join(filtered_words)

#french['text'] = french['text'].apply(remove_stopwords_french)


# Function to remove stop words for english
def remove_stopwords_italian(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_italian]
    return ' '.join(filtered_words)

#italian['text'] = italian['text'].apply(remove_stopwords_italian)


# Function to remove stop words for english
def remove_stopwords_portuguese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_portuguese]
    return ' '.join(filtered_words)

#portuguese['text'] = portuguese['text'].apply(remove_stopwords_portuguese)


# Function to remove stop words for english
def remove_stopwords_spanish(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_spanish]
    return ' '.join(filtered_words)

#spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)



#data  = pd.concat([english, chinese, french, italian, portuguese, spanish]

#print(data.head())
#print("Num rows:", len(data))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Creating datasets by language

In [16]:
english = data_all[data_all['language'] == 'English'] 
chinese = data_all[data_all['language'] == 'Chinese'] 
french = data_all[data_all['language'] == 'French'] 
italian = data_all[data_all['language'] == 'Italian'] 
portuguese = data_all[data_all['language'] == 'Portuguese'] 
spanish = data_all[data_all['language'] == 'Spanish'] 

In [17]:
def model_individual_lang(data, lang):
    #Feature extraction using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    X = tfidf_vectorizer.fit_transform(data['text'])
    y = data['label']
    
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    
    #Tran and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    #Model prediction and evaluation
    def models(mod, X_tr, y_tr, X_ts, y_ts, lang, test_data, name):
        model = mod
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        pearson_r, _ = pearsonr(y_pred, y_ts)
        print("Pearson's r for", model, "for", lang, "is: " , pearson_r)

        def calculate_pearson(mod, X_ts, y_ts):
            y_pred_1 = mod.predict(X_ts)
            pearson, _ = pearsonr(y_pred_1, y_ts)
            return pearson

        pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
        print("Pearson's r for", model, "for", lang, "after cross validation is: " , pearson_cv)

        mse = mean_squared_error(y_pred, y_ts)
        print("Mean Square Error for", model, "for", lang, "is: " , mse)

        mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
        print("Mean Square Error for", model, "for", lang, "after cross validation is: " , mse_cv)
        

        test_check = pd.DataFrame({
        'text': test_data['text'],
        'predicted_label': y_pred,
        'true_label': y_ts
        })
        
        #pearson_r_check, _ = pearsonr(y_pred, test_data['label'])
        #print("CHK Pearson's r for", model, "for", lang, "is: " , pearson_r_check)
              
        test_check.to_csv(f"{lang}_{name}.csv", index=False)
        
        return pearson_r, pearson_cv, mse, mse_cv, test_check
    
    print("************************************")
    lr_name = "lr"
    lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test, lang, test_data, lr_name)
    print("************************************")
    svr_name = "svr"
    svr = models(svm.SVR(), X_train, y_train, X_test, y_test, lang, test_data, svr_name)
    print("************************************")
    dt_name = "dtr"
    dtr = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test, lang, test_data, dt_name)
    print("************************************")
    ridge_name = "ridge"
    ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test, lang, test_data, ridge_name)
    print("************************************")
    rf_name = "rfr"
    rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test, lang, test_data, rf_name)
    print("************************************")
    return 

In [18]:
lang = "English"
eng = model_individual_lang(english, lang)

************************************
Pearson's r for LinearRegression() for English is:  0.492521445823381
Pearson's r for LinearRegression() for English after cross validation is:  [0.41779694 0.3079486  0.54645998 0.48231143 0.39029975]
Mean Square Error for LinearRegression() for English is:  0.6401628628071875
Mean Square Error for LinearRegression() for English after cross validation is:  [0.45029995 0.54914027 0.68958142 0.6265813  0.85607043]
************************************
Pearson's r for SVR() for English is:  0.5563765786268907
Pearson's r for SVR() for English after cross validation is:  [0.34154519 0.34355225 0.51552865 0.44263444 0.44932464]
Mean Square Error for SVR() for English is:  0.5502416941512925
Mean Square Error for SVR() for English after cross validation is:  [0.4793757  0.48553798 0.87152745 0.6366511  0.94259292]
************************************
Pearson's r for DecisionTreeRegressor() for English is:  0.3238295507779151
Pearson's r for DecisionTreeRe

In [19]:
lang = "Chinese"
eng = model_individual_lang(chinese, lang)

************************************
Pearson's r for LinearRegression() for Chinese is:  0.25313002583460475
Pearson's r for LinearRegression() for Chinese after cross validation is:  [ 0.12150478 -0.0069605   0.0422401   0.15857598  0.20201597]
Mean Square Error for LinearRegression() for Chinese is:  0.9346599982534893
Mean Square Error for LinearRegression() for Chinese after cross validation is:  [1.07564159 0.98464685 0.95709369 1.09272924 0.85099374]
************************************
Pearson's r for SVR() for Chinese is:  0.20338649590386815
Pearson's r for SVR() for Chinese after cross validation is:  [ 0.13302846 -0.0133223   0.02266632  0.11240775  0.17797016]
Mean Square Error for SVR() for Chinese is:  0.9761070063380892
Mean Square Error for SVR() for Chinese after cross validation is:  [1.13245661 0.97906459 0.93664346 1.1379655  0.84310666]
************************************
Pearson's r for DecisionTreeRegressor() for Chinese is:  0.03024984742685427
Pearson's r for 

In [20]:
lang = "Portuguese"
eng = model_individual_lang(portuguese, lang)

************************************
Pearson's r for LinearRegression() for Portuguese is:  0.2574391702209266
Pearson's r for LinearRegression() for Portuguese after cross validation is:  [0.27828481 0.29344697 0.55404532 0.27791585 0.23588118]
Mean Square Error for LinearRegression() for Portuguese is:  1.0938951699440238
Mean Square Error for LinearRegression() for Portuguese after cross validation is:  [0.91482291 0.70909955 0.52320288 0.57702426 0.82931557]
************************************
Pearson's r for SVR() for Portuguese is:  0.3909447065866024
Pearson's r for SVR() for Portuguese after cross validation is:  [0.17267038 0.36623538 0.42018758 0.21035825 0.26102788]
Mean Square Error for SVR() for Portuguese is:  0.6450516054891232
Mean Square Error for SVR() for Portuguese after cross validation is:  [0.95375023 0.64946422 0.64410994 0.49213557 0.81429614]
************************************
Pearson's r for DecisionTreeRegressor() for Portuguese is:  0.12055126293498095
P

In [21]:
lang = "Italian"
eng = model_individual_lang(italian, lang)

************************************
Pearson's r for LinearRegression() for Italian is:  0.035989570817294314
Pearson's r for LinearRegression() for Italian after cross validation is:  [0.21773757 0.25483045 0.35323925 0.3416285  0.28623938]
Mean Square Error for LinearRegression() for Italian is:  1.493188900108611
Mean Square Error for LinearRegression() for Italian after cross validation is:  [0.72690887 0.50511671 0.71971491 0.50422389 0.65501644]
************************************
Pearson's r for SVR() for Italian is:  0.4606001249135356
Pearson's r for SVR() for Italian after cross validation is:  [0.36104497 0.3295737  0.29377917 0.39798646 0.15155831]
Mean Square Error for SVR() for Italian is:  0.5363352108506688
Mean Square Error for SVR() for Italian after cross validation is:  [0.70169688 0.47139628 0.7948909  0.4848378  0.65407806]
************************************
Pearson's r for DecisionTreeRegressor() for Italian is:  0.23748026235304
Pearson's r for DecisionTreeRe

In [22]:
lang = "French"
eng = model_individual_lang(french, lang)

************************************
Pearson's r for LinearRegression() for French is:  0.18496168969174293
Pearson's r for LinearRegression() for French after cross validation is:  [0.2596664  0.24858583 0.17600121 0.19208147 0.04952975]
Mean Square Error for LinearRegression() for French is:  1.410117686724246
Mean Square Error for LinearRegression() for French after cross validation is:  [0.47582993 0.61659843 1.13525483 0.67900346 1.14736804]
************************************
Pearson's r for SVR() for French is:  0.36951759473622703
Pearson's r for SVR() for French after cross validation is:  [0.21239581 0.25265143 0.19190492 0.1192372  0.19354987]
Mean Square Error for SVR() for French is:  0.6730454835691447
Mean Square Error for SVR() for French after cross validation is:  [0.47625507 0.62056495 1.13061986 0.5729875  1.0109962 ]
************************************
Pearson's r for DecisionTreeRegressor() for French is:  0.25879322884890216
Pearson's r for DecisionTreeRegresso

In [23]:
lang = "Spanish"
eng = model_individual_lang(spanish, lang)

************************************
Pearson's r for LinearRegression() for Spanish is:  0.46006004202523126
Pearson's r for LinearRegression() for Spanish after cross validation is:  [0.26742633 0.05368369 0.38257655 0.45705684 0.30477561]
Mean Square Error for LinearRegression() for Spanish is:  0.7538114424237127
Mean Square Error for LinearRegression() for Spanish after cross validation is:  [0.98876824 0.94243982 0.73228016 0.77025104 0.90642608]
************************************
Pearson's r for SVR() for Spanish is:  0.5286745452961321
Pearson's r for SVR() for Spanish after cross validation is:  [0.37517392 0.29438328 0.51118631 0.42869045 0.33397245]
Mean Square Error for SVR() for Spanish is:  0.7124151838242583
Mean Square Error for SVR() for Spanish after cross validation is:  [0.97462503 0.75959338 0.73787658 0.87174828 0.91230856]
************************************
Pearson's r for DecisionTreeRegressor() for Spanish is:  0.23847083818080714
Pearson's r for DecisionTre