In [1]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score



#### Data read-in

In [2]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Filtering for english

In [3]:
data = data_all[data_all['language'] == 'English'] 
data

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English
...,...,...,...
1582,PSA!!! Even though I’m busy 99.99999% of the t...,2.2,English
1583,@user @OtterBox Isnt that the only reason we b...,1.6,English
1584,#NetajiSubhasChandraBose The ART The ARTIST http,1.0,English
1585,Nothing compares with being with someone who a...,3.6,English


#### Cleaning the data

In [4]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

#data['text'] = data['text'].apply(clean)

#data.head()

#### Tokenizing and Lemmatization

In [5]:
nltk.download('punkt')
nltk.download('wordnet')


def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(tokenize_and_lemmatize)


#### Creating datasets by language

In [6]:
english = data_all[data_all['language'] == 'English'] 
chinese = data_all[data_all['language'] == 'Chinese'] 
french = data_all[data_all['language'] == 'French'] 
italian = data_all[data_all['language'] == 'Italian'] 
portuguese = data_all[data_all['language'] == 'Portuguese'] 
spanish = data_all[data_all['language'] == 'Spanish'] 

#### Removing Stopwords

In [7]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

stop_words_english = set(stopwords.words('english'))
stop_words_chinese = set(stopwords.words('chinese'))
stop_words_french = set(stopwords.words('french'))
stop_words_italian = set(stopwords.words('italian'))
stop_words_portuguese = set(stopwords.words('portuguese'))
stop_words_spanish = set(stopwords.words('spanish'))


# Function to remove stop words for english
def remove_stopwords_english(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_english]
    return ' '.join(filtered_words)

english['text'] = english['text'].apply(remove_stopwords_english)


# Function to remove stop words for chinese
def remove_stopwords_chinese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_chinese]
    return ' '.join(filtered_words)

chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)


# Function to remove stop words for french
def remove_stopwords_french(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_french]
    return ' '.join(filtered_words)

french['text'] = french['text'].apply(remove_stopwords_french)


# Function to remove stop words for italian
def remove_stopwords_italian(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_italian]
    return ' '.join(filtered_words)

italian['text'] = italian['text'].apply(remove_stopwords_italian)


# Function to remove stop words for portuguese
def remove_stopwords_portuguese(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_portuguese]
    return ' '.join(filtered_words)

portuguese['text'] = portuguese['text'].apply(remove_stopwords_portuguese)


# Function to remove stop words for spanish
def remove_stopwords_spanish(text):
    
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words_spanish]
    return ' '.join(filtered_words)

spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)



#data  = pd.concat([english, chinese, french, italian, portuguese, spanish]

#print(data.head())
#print("Num rows:", len(data))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english['text'] = english['text'].apply(remove_stopwords_english)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyda

In [8]:
def model_individual_lang_val(data, lang):
    #Feature extraction using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    X = tfidf_vectorizer.fit_transform(data['text'])
    y = data['label']
    
    #Tran and validation split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    #Model prediction and evaluation
    def models(mod, X_tr, y_tr, X_ts, y_ts, lang, name):
        model = mod
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        pearson_r, _ = pearsonr(y_pred, y_ts)
        print("Pearson's r for", model, "for", lang, "is: " , pearson_r)

        def calculate_pearson(mod, X_ts, y_ts):
            y_pred_1 = mod.predict(X_ts)
            pearson, _ = pearsonr(y_pred_1, y_ts)
            return pearson

        pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
        print("Pearson's r for", model, "for", lang, "after cross validation is: " , pearson_cv)

        mse = mean_squared_error(y_pred, y_ts)
        print("Mean Square Error for", model, "for", lang, "is: " , mse)

        mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
        print("Mean Square Error for", model, "for", lang, "after cross validation is: " , mse_cv)
        
        return pearson_r, pearson_cv, mse, mse_cv 
    
    print("************************************")
    lr_name = "lr"
    lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test, lang, lr_name)
    print("************************************")
    svr_name = "svr"
    svr = models(svm.SVR(), X_train, y_train, X_test, y_test, lang, svr_name)
    print("************************************")
    dt_name = "dtr"
    dtr = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test, lang, dt_name)
    print("************************************")
    ridge_name = "ridge"
    ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test, lang, ridge_name)
    print("************************************")
    rf_name = "rfr"
    rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test, lang, rf_name)
    print("************************************")
    return X_train, y_train

In [9]:
lang = "English"
eng = model_individual_lang_val(english, lang)

************************************
Pearson's r for LinearRegression() for English is:  0.31830267826863123
Pearson's r for LinearRegression() for English after cross validation is:  [0.18092105 0.34184677 0.42595256 0.40732081 0.31323752]
Mean Square Error for LinearRegression() for English is:  0.8339001045753336
Mean Square Error for LinearRegression() for English after cross validation is:  [0.5694965  0.47897352 0.94463562 0.63556505 0.98700244]
************************************
Pearson's r for SVR() for English is:  0.44111953205317617
Pearson's r for SVR() for English after cross validation is:  [0.13541934 0.2727585  0.37509373 0.32495094 0.34086805]
Mean Square Error for SVR() for English is:  0.623565553403764
Mean Square Error for SVR() for English after cross validation is:  [0.51591024 0.49637213 0.9414749  0.68264689 0.97404817]
************************************
Pearson's r for DecisionTreeRegressor() for English is:  0.20530009316889375
Pearson's r for DecisionTre

In [10]:
lang = "Chinese"
chi = model_individual_lang_val(chinese, lang)

************************************
Pearson's r for LinearRegression() for Chinese is:  0.2510941366964319
Pearson's r for LinearRegression() for Chinese after cross validation is:  [ 0.12150072 -0.0069595   0.04223008  0.1585723   0.20201589]
Mean Square Error for LinearRegression() for Chinese is:  0.9346324370899847
Mean Square Error for LinearRegression() for Chinese after cross validation is:  [1.07564231 0.98464629 0.95709696 1.09273041 0.85099368]
************************************
Pearson's r for SVR() for Chinese is:  0.2035664901694953
Pearson's r for SVR() for Chinese after cross validation is:  [ 0.13302349 -0.01332093  0.02265517  0.11240559  0.17797249]
Mean Square Error for SVR() for Chinese is:  0.9757658514238731
Mean Square Error for SVR() for Chinese after cross validation is:  [1.13245724 0.97906443 0.93664484 1.13796588 0.84310687]
************************************
Pearson's r for DecisionTreeRegressor() for Chinese is:  0.051758950307932226
Pearson's r for D

In [11]:
lang = "Portuguese"
port = model_individual_lang_val(portuguese, lang)

************************************
Pearson's r for LinearRegression() for Portuguese is:  0.25489238252788204
Pearson's r for LinearRegression() for Portuguese after cross validation is:  [0.18167572 0.19225798 0.33475427 0.25844449 0.10351257]
Mean Square Error for LinearRegression() for Portuguese is:  1.102185544054913
Mean Square Error for LinearRegression() for Portuguese after cross validation is:  [0.92962491 0.78375825 0.68401158 0.5711345  0.88770608]
************************************
Pearson's r for SVR() for Portuguese is:  0.3032123118091421
Pearson's r for SVR() for Portuguese after cross validation is:  [0.14030692 0.31430183 0.2869971  0.22968061 0.16244982]
Mean Square Error for SVR() for Portuguese is:  0.6890085112068417
Mean Square Error for SVR() for Portuguese after cross validation is:  [0.97013251 0.66959677 0.7064321  0.48721252 0.84524543]
************************************
Pearson's r for DecisionTreeRegressor() for Portuguese is:  0.12698269754027428
P

In [12]:
lang = "Italian"
ital = model_individual_lang_val(italian, lang)

************************************
Pearson's r for LinearRegression() for Italian is:  0.34355991680034015
Pearson's r for LinearRegression() for Italian after cross validation is:  [0.23965139 0.3153995  0.34128439 0.27570292 0.39322921]
Mean Square Error for LinearRegression() for Italian is:  0.7525090875317504
Mean Square Error for LinearRegression() for Italian after cross validation is:  [0.69657537 0.48542724 0.72632814 0.52335093 0.58349093]
************************************
Pearson's r for SVR() for Italian is:  0.4165523912086395
Pearson's r for SVR() for Italian after cross validation is:  [0.28607371 0.24970033 0.2731261  0.25571108 0.29443102]
Mean Square Error for SVR() for Italian is:  0.5578236798811332
Mean Square Error for SVR() for Italian after cross validation is:  [0.72258728 0.48685638 0.79528053 0.51624534 0.62196408]
************************************
Pearson's r for DecisionTreeRegressor() for Italian is:  0.2639022727749299
Pearson's r for DecisionTree

In [13]:
lang = "French"
fre = model_individual_lang_val(french, lang)

************************************
Pearson's r for LinearRegression() for French is:  0.1017600078693717
Pearson's r for LinearRegression() for French after cross validation is:  [ 0.07282151  0.1112085   0.00592574  0.04800037 -0.16775975]
Mean Square Error for LinearRegression() for French is:  1.090634520139914
Mean Square Error for LinearRegression() for French after cross validation is:  [0.57332812 0.71282156 1.25477693 0.63084472 1.21683259]
************************************
Pearson's r for SVR() for French is:  0.2108834218718691
Pearson's r for SVR() for French after cross validation is:  [ 0.11453364  0.1050511   0.0465609  -0.0493509  -0.01419225]
Mean Square Error for SVR() for French is:  0.7409413740620678
Mean Square Error for SVR() for French after cross validation is:  [0.49428408 0.6508171  1.16326827 0.59200123 1.05376641]
************************************
Pearson's r for DecisionTreeRegressor() for French is:  0.1576962276565338
Pearson's r for DecisionTreeR

In [14]:
lang = "Spanish"
spn = model_individual_lang_val(spanish, lang)

************************************
Pearson's r for LinearRegression() for Spanish is:  0.34835817755115384
Pearson's r for LinearRegression() for Spanish after cross validation is:  [ 0.00692504  0.0132769   0.31035647 -0.08479288  0.14223261]
Mean Square Error for LinearRegression() for Spanish is:  1.0929244441359431
Mean Square Error for LinearRegression() for Spanish after cross validation is:  [1.84200426 1.1078668  1.21662091 1.8859507  0.95033332]
************************************
Pearson's r for SVR() for Spanish is:  0.36678252471100736
Pearson's r for SVR() for Spanish after cross validation is:  [0.06486513 0.12807621 0.44012432 0.22669573 0.18106641]
Mean Square Error for SVR() for Spanish is:  0.8292925686890951
Mean Square Error for SVR() for Spanish after cross validation is:  [1.0579001  0.81495152 0.78585363 0.96845733 0.97144647]
************************************
Pearson's r for DecisionTreeRegressor() for Spanish is:  0.22619586874997552
Pearson's r for Decis

## Evaluating the performance on the test data

In [15]:
test = pd.read_csv('test.csv')
print(test)

english_test = test[test['language'] == 'English'] 
chinese_test = test[test['language'] == 'Chinese'] 
french_test = test[test['language'] == 'French'] 
italian_test = test[test['language'] == 'Italian'] 
portuguese_test = test[test['language'] == 'Portuguese'] 
spanish_test = test[test['language'] == 'Spanish'] 

                                                   text  label    language
0                                             @user 아..    3.0      Korean
1                                @user @user je rêve ??    2.2      French
2                                          thank u, nxt    1.0     English
3                            @user ma che cosa HO FATTO    2.6     Italian
4      在教室打飞机，站累了，就搬凳子坐下，站着坐着都是你爸爸，听爸爸的话哦！骚货，爸爸爱你。 http    3.0     Chinese
...                                                 ...    ...         ...
3876  @user Não sei se ele vai terminar com a Kyra (...    3.0  Portuguese
3877  @user Coitada...fraquinha....Povo precisa sabe...    1.4  Portuguese
3878  @user أي عطر حبيبي .... مساعدة بشار البعثي عطر ؟؟    3.5      Arabic
3879  TSC promoveert naar tweede klasse, ontgoocheli...    1.0       Dutch
3880  @user if you pull up to emmas house bald tomor...    1.8     English

[3881 rows x 3 columns]


In [16]:
def model_individual_lang_test(data, test, lang):
    #Feature extraction using TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    X = tfidf_vectorizer.fit_transform(data['text'])
    y = data['label']
    
    #Tran and validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_test = tfidf_vectorizer.transform(test['text'])
    y_test = test['label']
    
    #Model prediction and evaluation
    def models(mod, X_tr, y_tr, X_ts, y_ts, lang, name):
        model = mod
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        pearson_r, _ = pearsonr(y_pred, y_ts)
        print("Pearson's r for", model, "for", lang, "on the test data is: " , pearson_r)

        def calculate_pearson(mod, X_ts, y_ts):
            y_pred_1 = mod.predict(X_ts)
            pearson, _ = pearsonr(y_pred_1, y_ts)
            return pearson

        pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
        print("Pearson's r for", model, "for", lang, "after cross validation on the test data is: " , pearson_cv)

        mse = mean_squared_error(y_pred, y_ts)
        print("Mean Square Error for", model, "for", lang, "on the test data is: " , mse)

        mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
        print("Mean Square Error for", model, "for", lang, "after cross validation on the test data is: " , mse_cv)
        
        return pearson_r, pearson_cv, mse, mse_cv 
    
    print("************************************")
    lr_name = "lr"
    lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test, lang, lr_name)
    print("************************************")
    svr_name = "svr"
    svr = models(svm.SVR(), X_train, y_train, X_test, y_test, lang, svr_name)
    print("************************************")
    dt_name = "dtr"
    dtr = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test, lang, dt_name)
    print("************************************")
    ridge_name = "ridge"
    ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test, lang, ridge_name)
    print("************************************")
    rf_name = "rfr"
    rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test, lang, rf_name)
    print("************************************")
    return X_train, y_train

In [17]:
lang = "English"
eng_t = model_individual_lang_test(english, english_test, lang)

************************************
Pearson's r for LinearRegression() for English on the test data is:  0.25309035017755827
Pearson's r for LinearRegression() for English after cross validation on the test data is:  [0.4742203  0.12121385 0.08408597 0.16580177 0.31293931]
Mean Square Error for LinearRegression() for English on the test data is:  1.0330091222487474
Mean Square Error for LinearRegression() for English after cross validation on the test data is:  [0.80852235 1.4752371  1.19546297 1.03371639 0.81797059]
************************************
Pearson's r for SVR() for English on the test data is:  0.28776645570759224
Pearson's r for SVR() for English after cross validation on the test data is:  [0.25114478 0.39891437 0.39659565 0.16489093 0.37716001]
Mean Square Error for SVR() for English on the test data is:  0.7007426111371773
Mean Square Error for SVR() for English after cross validation on the test data is:  [0.67322109 0.61902603 0.66581002 0.84608395 0.60028471]
****

In [18]:
lang = "Chinese"
chi_t = model_individual_lang_test(chinese, chinese_test, lang)

************************************
Pearson's r for LinearRegression() for Chinese on the test data is:  0.28652702279276365
Pearson's r for LinearRegression() for Chinese after cross validation on the test data is:  [0.14853214 0.20232069 0.22950454 0.23853127 0.2239131 ]
Mean Square Error for LinearRegression() for Chinese on the test data is:  0.8777633942544508
Mean Square Error for LinearRegression() for Chinese after cross validation on the test data is:  [0.77293608 0.95731121 0.61856902 0.76299226 0.69736849]
************************************
Pearson's r for SVR() for Chinese on the test data is:  0.2889306989023463
Pearson's r for SVR() for Chinese after cross validation on the test data is:  [0.22146928 0.24729495 0.25761977 0.13231776 0.16765183]
Mean Square Error for SVR() for Chinese on the test data is:  0.7591775029421168
Mean Square Error for SVR() for Chinese after cross validation on the test data is:  [0.74684692 0.9713804  0.64523191 0.84364364 0.77430868]
*****

In [19]:
lang = "Portuguese"
port_t = model_individual_lang_test(portuguese, portuguese_test, lang)

************************************
Pearson's r for LinearRegression() for Portuguese on the test data is:  0.23813392560254168
Pearson's r for LinearRegression() for Portuguese after cross validation on the test data is:  [0.02426591 0.18149588 0.22378394 0.21709504 0.07211893]
Mean Square Error for LinearRegression() for Portuguese on the test data is:  1.107483797461752
Mean Square Error for LinearRegression() for Portuguese after cross validation on the test data is:  [0.9685023  1.47424816 0.87658197 0.94608451 1.1912768 ]
************************************
Pearson's r for SVR() for Portuguese on the test data is:  0.38478894201990227
Pearson's r for SVR() for Portuguese after cross validation on the test data is:  [0.28728613 0.24711851 0.12795877 0.29224539 0.21117332]
Mean Square Error for SVR() for Portuguese on the test data is:  0.6236314278906393
Mean Square Error for SVR() for Portuguese after cross validation on the test data is:  [0.62984703 0.79866538 0.63861633 0.70

In [20]:
lang = "Italian"
ital_t = model_individual_lang_test(italian, italian_test, lang)

************************************
Pearson's r for LinearRegression() for Italian on the test data is:  0.30412377614431796
Pearson's r for LinearRegression() for Italian after cross validation on the test data is:  [0.06923299 0.11681153 0.078917   0.18818495 0.38670197]
Mean Square Error for LinearRegression() for Italian on the test data is:  0.8618201138323451
Mean Square Error for LinearRegression() for Italian after cross validation on the test data is:  [1.30665447 1.10853863 1.1433608  1.08305633 1.15509178]
************************************
Pearson's r for SVR() for Italian on the test data is:  0.44645573269051664
Pearson's r for SVR() for Italian after cross validation on the test data is:  [0.17386766 0.26826266 0.23851885 0.31785705 0.41199113]
Mean Square Error for SVR() for Italian on the test data is:  0.5873257167432065
Mean Square Error for SVR() for Italian after cross validation on the test data is:  [0.80255816 0.66594949 0.60243506 0.47789389 0.78068602]
****

In [21]:
lang = "French"
fre_t = model_individual_lang_test(french, french_test, lang)

************************************
Pearson's r for LinearRegression() for French on the test data is:  0.19022246523565967
Pearson's r for LinearRegression() for French after cross validation on the test data is:  [ 0.24319306 -0.21881811  0.13391583  0.07662261  0.01811357]
Mean Square Error for LinearRegression() for French on the test data is:  1.3421285921536226
Mean Square Error for LinearRegression() for French after cross validation on the test data is:  [1.60803485 2.62716945 1.11600732 2.19291769 7.12966633]
************************************
Pearson's r for SVR() for French on the test data is:  0.29110964369148623
Pearson's r for SVR() for French after cross validation on the test data is:  [0.22845109 0.30640966 0.231508   0.52559329 0.23333253]
Mean Square Error for SVR() for French on the test data is:  0.7218117890971402
Mean Square Error for SVR() for French after cross validation on the test data is:  [0.89156599 0.65304348 0.54994886 0.54370244 1.02341289]
*******

In [22]:
lang = "Spanish"
spn_t = model_individual_lang_test(spanish, spanish_test, lang)

************************************
Pearson's r for LinearRegression() for Spanish on the test data is:  0.3383086965789861
Pearson's r for LinearRegression() for Spanish after cross validation on the test data is:  [0.02172857 0.13733769 0.37450924 0.03909381 0.3194868 ]
Mean Square Error for LinearRegression() for Spanish on the test data is:  1.0812683444031872
Mean Square Error for LinearRegression() for Spanish after cross validation on the test data is:  [7.13887769 2.45772815 1.40587335 4.80069363 1.28225771]
************************************
Pearson's r for SVR() for Spanish on the test data is:  0.3640396384027338
Pearson's r for SVR() for Spanish after cross validation on the test data is:  [0.43727201 0.530787   0.52435601 0.40767761 0.45815462]
Mean Square Error for SVR() for Spanish on the test data is:  0.8364646227303587
Mean Square Error for SVR() for Spanish after cross validation on the test data is:  [0.93487749 0.7282111  0.67246512 0.73388284 0.6288387 ]
******