In [1]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score



#### Data read-in

In [2]:
# Load the training set from a path relative to the notebook's working directory.
# The printed frame below shows three columns: text, label and language.
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


#### Cleaning the data

In [3]:
import re

def clean(text):
    """Remove user mentions and the literal token ``http`` from a piece of text.

    The three substitutions run in the order written, then leading and trailing
    whitespace is stripped. Only the four characters ``http`` are removed; any
    remainder of a URL is left in place. Whitespace between the removed pieces
    is not collapsed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub(r'@user', '', text)
    text = re.sub(r'http', '', text)
    # Raw string literal: '\w' inside a normal string is an invalid escape
    # sequence, which newer Python versions warn about.
    text = re.sub(r'@\w+', '', text)
    return text.strip()

#data_all['text'] = data_all['text'].apply(clean)

#data_all.head()

#### Tokenizing and Lemmatization

In [4]:
# Fetch the NLTK resources needed by the tokenizer ('punkt') and the
# WordNet lemmatizer ('wordnet'); already-present packages are reported as up-to-date.
nltk.download('punkt')
nltk.download('wordnet')

def tokenize_and_lemmatize(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens joined by single spaces.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with no explicit
    part-of-speech argument. Because the tokens are re-joined with spaces,
    punctuation that the tokenizer splits off ends up space-separated in the
    result (e.g. ``@user`` becomes ``@ user``).

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Space-joined lemmatized tokens.
    """
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    return ' '.join(map(wnl.lemmatize, tokens))

# Overwrite the text column in place with its tokenized + lemmatized form.
# Re-running this cell feeds the already-processed text through the function again.
data_all['text'] = data_all['text'].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Creating datasets by language

In [5]:
# One boolean-mask subset of the training frame per language label.
# In the visible cells these frames are only referenced by commented-out code
# (per-language stop-word removal and the disabled pd.concat).
english = data_all[data_all['language'] == 'English'] 
chinese = data_all[data_all['language'] == 'Chinese'] 
french = data_all[data_all['language'] == 'French'] 
italian = data_all[data_all['language'] == 'Italian'] 
portuguese = data_all[data_all['language'] == 'Portuguese'] 
spanish = data_all[data_all['language'] == 'Spanish'] 

#### Stopwords removal

In [6]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

# One set per language so that membership checks in the remove_stopwords_*
# helpers are hash lookups rather than list scans.
stop_words_english = set(stopwords.words('english'))
stop_words_chinese = set(stopwords.words('chinese'))
stop_words_french = set(stopwords.words('french'))
stop_words_italian = set(stopwords.words('italian'))
stop_words_portuguese = set(stopwords.words('portuguese'))
stop_words_spanish = set(stopwords.words('spanish'))
stop_words_arabic = set(stopwords.words('arabic'))
stop_words_dutch = set(stopwords.words('dutch'))
# Hindi and Korean are left disabled — presumably the NLTK corpus has no
# list for them; TODO confirm.
#stop_words_hindi = set(stopwords.words('hindi'))
#stop_words_korean = set(stopwords.words('korean'))


def _remove_stopwords(text, stop_words):
    """Tokenize ``text`` and drop every token whose lowercase form is in ``stop_words``.

    Shared implementation behind the per-language ``remove_stopwords_*``
    wrappers below, which differ only in the stop-word set they pass in.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : collection of str
        Lowercase stop words; tokens are lowercased before the membership test
        but kept in their original casing in the output.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept)


# Function to remove stop words for english
def remove_stopwords_english(text):
    """Return ``text`` with tokens found in ``stop_words_english`` removed."""
    return _remove_stopwords(text, stop_words_english)

#english['text'] = english['text'].apply(remove_stopwords_english)


# Function to remove stop words for chinese
def remove_stopwords_chinese(text):
    """Return ``text`` with tokens found in ``stop_words_chinese`` removed."""
    return _remove_stopwords(text, stop_words_chinese)

#chinese['text'] = chinese['text'].apply(remove_stopwords_chinese)


# Function to remove stop words for french
def remove_stopwords_french(text):
    """Return ``text`` with tokens found in ``stop_words_french`` removed."""
    return _remove_stopwords(text, stop_words_french)

#french['text'] = french['text'].apply(remove_stopwords_french)


# Function to remove stop words for italian
def remove_stopwords_italian(text):
    """Return ``text`` with tokens found in ``stop_words_italian`` removed."""
    return _remove_stopwords(text, stop_words_italian)

#italian['text'] = italian['text'].apply(remove_stopwords_italian)


# Function to remove stop words for portuguese
def remove_stopwords_portuguese(text):
    """Return ``text`` with tokens found in ``stop_words_portuguese`` removed."""
    return _remove_stopwords(text, stop_words_portuguese)

#portuguese['text'] = portuguese['text'].apply(remove_stopwords_portuguese)


# Function to remove stop words for spanish
def remove_stopwords_spanish(text):
    """Return ``text`` with tokens found in ``stop_words_spanish`` removed."""
    return _remove_stopwords(text, stop_words_spanish)

#spanish['text'] = spanish['text'].apply(remove_stopwords_spanish)

# Function to remove stop words for arabic
def remove_stopwords_arabic(text):
    """Return ``text`` with tokens found in ``stop_words_arabic`` removed."""
    return _remove_stopwords(text, stop_words_arabic)

# Function to remove stop words for dutch
def remove_stopwords_dutch(text):
    """Return ``text`` with tokens found in ``stop_words_dutch`` removed."""
    return _remove_stopwords(text, stop_words_dutch)

# Function to remove stop words for hindi
#def remove_stopwords_hindi(text):
    
#    words = word_tokenize(text)
#    filtered_words = [word for word in words if word.lower() not in stop_words_hindi]
#    return ' '.join(filtered_words)

# Function to remove stop words for korean
#def remove_stopwords_korean(text):
    
#    words = word_tokenize(text)
#    filtered_words = [word for word in words if word.lower() not in stop_words_korean]
#    return ' '.join(filtered_words)



# The per-language concatenation is disabled; `data` is simply another name for
# the full tokenized + lemmatized frame (no copy is made, and none of the
# stop-word removal functions has been applied to it).
#data  = pd.concat([english, chinese, french, italian, portuguese, spanish])
data = data_all

print(data.head())
print("Num rows:", len(data))


                                                text  label language
0  wearing a fake engagement ring so guy won ’ t ...    1.8  English
1                              Bees vs. Wasps . http    1.0  English
2              Here is a nice equation : 0+0-0-0+0=0    1.0  English
3           @ user @ user Enjoy each new day ! 😊🇨🇦🐞🐭    1.6  English
4  I can be having a perfectly good day then I th...    1.6  English
Num rows: 9491


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Feature extraction using TF-IDF

In [7]:
# Fit a TF-IDF vectorizer (default settings) on every row of `data` and use the
# numeric `label` column as the regression target.
# NOTE(review): the vectorizer is fitted before the train/validation split, so
# rows that later land in the validation split also shape the vocabulary and
# IDF weights — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

#### Train and Test Split

In [8]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Creating a function to predict and evaluate the prediction

In [9]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    """Fit ``mod`` on the training split, then print and return its evaluation metrics.

    Parameters
    ----------
    mod : estimator
        Regressor exposing ``fit`` and ``predict``. It is fitted in place.
    X_tr, y_tr
        Training features and targets passed to ``mod.fit``.
    X_ts, y_ts
        Evaluation features and targets.

    Returns
    -------
    tuple
        ``(pearson_r, pearson_cv, mse, mse_cv)`` where ``pearson_r`` and ``mse``
        score the predictions of the fitted model on ``(X_ts, y_ts)``, and
        ``pearson_cv`` / ``mse_cv`` are the per-fold results of 5-fold
        ``cross_val_score`` run on ``(X_ts, y_ts)``.
    """
    def calculate_pearson(estimator, X, y):
        """Scorer in the ``(estimator, X, y)`` form accepted by ``cross_val_score``."""
        r, _ = pearsonr(estimator.predict(X), y)
        return r

    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)

    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)

    # NOTE(review): both cross-validation calls below are given the evaluation
    # split, not the training split, so the estimator is re-fitted on folds of
    # (X_ts, y_ts) and these scores do not describe the model fitted on X_tr
    # above. Confirm that this is the intended protocol.
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)

    # Arguments follow the documented (y_true, y_pred) order.
    mse = mean_squared_error(y_ts, y_pred)
    print("Mean Square Error for", model, "is: " , mse)

    # The scorer returns negated MSE (higher is better), hence the sign flip.
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)

    return pearson_r, pearson_cv, mse, mse_cv

#### Linear Regression

In [10]:
# Ordinary least squares on the TF-IDF features, scored on the validation split.
# `lr` holds the tuple (pearson_r, pearson_cv, mse, mse_cv) returned by models().
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.23089700589113848
Pearson's r for LinearRegression() after cross validation is:  [0.22241299 0.29895702 0.25895541 0.27400264 0.31028882]
Mean Square Error for LinearRegression() is:  1.077527942861177
Mean Square Error for LinearRegression() after cross validation is:  [0.72403087 0.81785748 0.71789167 0.73061872 0.81930056]


#### Support Vector Regression

In [11]:
# Support vector regression with default hyper-parameters, scored on the validation split.
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.4127894693459429
Pearson's r for SVR() after cross validation is:  [0.30831422 0.30930053 0.29001737 0.34050096 0.3746493 ]
Mean Square Error for SVR() is:  0.6670610736297159
Mean Square Error for SVR() after cross validation is:  [0.6592582  0.79862815 0.68062319 0.69242026 0.8211113 ]


#### Decision Tree Regressor

In [12]:
# Decision tree with default hyper-parameters, scored on the validation split.
# No random_state is passed here, unlike the random forest cell.
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.20263964456394604
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.21787455 0.16428198 0.20434918 0.24173278 0.22503976]
Mean Square Error for DecisionTreeRegressor() is:  1.1960754602282024
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.11076202 1.45642333 1.14555344 1.17634284 1.32125916]


#### Ridge Regression

In [13]:
# Ridge regression with regularization strength alpha=0.9, scored on the validation split.
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.41045168257446313
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.28939259 0.31244344 0.29504246 0.34804303 0.36689302]
Mean Square Error for Ridge(alpha=0.9) is:  0.6613361135725191
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.66303318 0.7829483  0.66233566 0.67191569 0.78456122]


#### Random Forest Regressor

In [14]:
# Random forest with a fixed seed so the run is repeatable, scored on the validation split.
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.3438763305236285
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.24358303 0.21107175 0.26506125 0.33971407 0.33833686]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.7570275702431948
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.81503506 1.01898314 0.83333417 0.77170308 0.94353945]


# Evaluation on the test data

In [15]:
# Load the test set from the working directory. The printed sample below includes
# languages (e.g. Korean, Arabic, Dutch) that do not appear in the training
# subsets built earlier.
test_data = pd.read_csv('test.csv')
print(test_data)

                                                   text  label    language
0                                             @user 아..    3.0      Korean
1                                @user @user je rêve ??    2.2      French
2                                          thank u, nxt    1.0     English
3                            @user ma che cosa HO FATTO    2.6     Italian
4      在教室打飞机，站累了，就搬凳子坐下，站着坐着都是你爸爸，听爸爸的话哦！骚货，爸爸爱你。 http    3.0     Chinese
...                                                 ...    ...         ...
3876  @user Não sei se ele vai terminar com a Kyra (...    3.0  Portuguese
3877  @user Coitada...fraquinha....Povo precisa sabe...    1.4  Portuguese
3878  @user أي عطر حبيبي .... مساعدة بشار البعثي عطر ؟؟    3.5      Arabic
3879  TSC promoveert naar tweede klasse, ontgoocheli...    1.0       Dutch
3880  @user if you pull up to emmas house bald tomor...    1.8     English

[3881 rows x 3 columns]


In [16]:
# Apply the same preprocessing to the test set as was applied to the training set.

# Data Cleaning (disabled, as it is for the training data)
#test_data['text'] = test_data['text'].apply(clean)

# Tokenizing and Lemmatization (overwrites the text column in place)
test_data['text'] = test_data['text'].apply(tokenize_and_lemmatize)

#Creating datasets by language
# These subsets are only referenced by the commented-out lines below.
english_test = test_data[test_data['language'] == 'English'] 
chinese_test = test_data[test_data['language'] == 'Chinese'] 
french_test = test_data[test_data['language'] == 'French'] 
italian_test = test_data[test_data['language'] == 'Italian'] 
portuguese_test = test_data[test_data['language'] == 'Portuguese'] 
spanish_test = test_data[test_data['language'] == 'Spanish'] 
arabic_test = test_data[test_data['language'] == 'Arabic'] 
dutch_test = test_data[test_data['language'] == 'Dutch'] 
hindi_test = test_data[test_data['language'] == 'Hindi'] 
korean_test = test_data[test_data['language'] == 'Korean'] 

# Removing Stopwords (disabled, as it is for the training data)
#english_test['text'] = english_test['text'].apply(remove_stopwords_english)
#chinese_test['text'] = chinese_test['text'].apply(remove_stopwords_chinese)
#french_test['text'] = french_test['text'].apply(remove_stopwords_french)
#italian_test['text'] = italian_test['text'].apply(remove_stopwords_italian)
#portuguese_test['text'] = portuguese_test['text'].apply(remove_stopwords_portuguese)
#spanish_test['text'] = spanish_test['text'].apply(remove_stopwords_spanish)
#arabic_test['text'] = arabic_test['text'].apply(remove_stopwords_arabic)
#dutch_test['text'] = dutch_test['text'].apply(remove_stopwords_dutch)
#hindi_test['text'] = hindi_test['text'].apply(remove_stopwords_hindi)
#korean_test['text'] = korean_test['text'].apply(remove_stopwords_korean)

# `test` is another name for the full lemmatized test frame (no copy is made).
#test  = pd.concat([english_test, chinese_test, french_test, italian_test, portuguese_test, spanish_test, arabic_test, dutch_test, hindi_test, korean_test])
test = test_data

# Feature extraction using TF-IDF
# transform() only: the vectorizer fitted on the training data is reused rather than refitted.
test_X = tfidf_vectorizer.transform(test['text'])
test_y = test['label']

#### Linear Regression

In [17]:
# Fit linear regression on the training split (X_train) and score it on the test-set features.
lr_test = models(linear_model.LinearRegression(), X_train, y_train, test_X, test_y)

Pearson's r for LinearRegression() is:  0.1466130222052112
Pearson's r for LinearRegression() after cross validation is:  [0.20851078 0.16861297 0.18241796 0.22750515 0.19299588]
Mean Square Error for LinearRegression() is:  1.4928472459008737
Mean Square Error for LinearRegression() after cross validation is:  [1.15868499 1.23329957 1.10916381 1.06707512 1.25833742]


#### Support Vector Regression

In [18]:
# Fit a default SVR on the training split (X_train) and score it on the test-set features.
svr_test = models(svm.SVR(), X_train, y_train, test_X, test_y)

Pearson's r for SVR() is:  0.2298459746013823
Pearson's r for SVR() after cross validation is:  [0.35319267 0.31038057 0.33721598 0.32290689 0.3306959 ]
Mean Square Error for SVR() is:  0.9749033682013704
Mean Square Error for SVR() after cross validation is:  [0.88968995 0.89137452 0.80110303 0.8175126  0.84275587]


#### Decision Tree Regressor

In [19]:
# Fit a default decision tree on the training split (X_train) and score it on the test-set features.
dt_test = models(tree.DecisionTreeRegressor(), X_train, y_train, test_X, test_y)

Pearson's r for DecisionTreeRegressor() is:  0.18205010322063803
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.15654114 0.16338708 0.21649265 0.19948342 0.19191758]
Mean Square Error for DecisionTreeRegressor() is:  1.2375066454803945
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [1.2262826  1.25483461 1.13938939 1.14244206 1.19548184]


#### Ridge Regressor

In [20]:
# Fit ridge regression (alpha=0.9) on the training split (X_train) and score it on the test-set features.
ridge_test = models(linear_model.Ridge(alpha=0.9), X_train, y_train, test_X, test_y)

Pearson's r for Ridge(alpha=0.9) is:  0.21784341254799272
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.29606818 0.27051856 0.29168931 0.28057036 0.28893488]
Mean Square Error for Ridge(alpha=0.9) is:  1.018905446864162
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.88314438 0.89725691 0.824735   0.83063461 0.85828462]


#### Random Forest Regressor

In [21]:
# Fit a seeded random forest on the training split (X_train) and score it on the test-set features.
rf_test = models(RandomForestRegressor(random_state=42), X_train, y_train, test_X, test_y)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.26227367783835914
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.28657197 0.27730172 0.28656303 0.30693723 0.22101396]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.9595807549608741
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.92961157 0.94451142 0.85728764 0.85166243 0.94838757]


In [22]:
# Refit a default SVR on the training split and predict the test set, keeping
# the predictions so they can be broken down by language.
model = svm.SVR()
model.fit(X_train, y_train)
y_pred = model.predict(test_X)
    
# One row per test item: lemmatized text, language, SVR prediction and true label.
test_with_pred = pd.DataFrame({
        'text': test['text'],
        'language': test['language'],
        'predicted_label': y_pred,
        'true_label': test_y
        })

#### Pearson R for each language

In [23]:
def calculate_pearsonr_for_diff_lang(language, data):
    """Return Pearson's r between predicted and true labels for one language.

    Parameters
    ----------
    language : str
        Value to match against the ``language`` column.
    data : pandas.DataFrame
        Frame with ``language``, ``predicted_label`` and ``true_label`` columns.

    Returns
    -------
    float
        The correlation coefficient for the rows of ``data`` whose language
        equals ``language``.
    """
    # Filter the frame that was passed in, not a notebook-level global, so the
    # function works on any prediction frame.
    rows = data[data['language'] == language]
    pearson, _ = pearsonr(rows['predicted_label'], rows['true_label'])

    return pearson

# Report the SVR correlation separately for every language present in the test predictions.
languages = set(test_with_pred['language'])

# Iterate in sorted order: the iteration order of a set of strings can change
# between kernel runs, which would shuffle the printed lines on every re-run.
for lang in sorted(languages):
    correlation = calculate_pearsonr_for_diff_lang(lang, test_with_pred)
    print(f"Pearson correlation for {lang} with SVR is {correlation}")

Pearson correlation for Korean with SVR is -0.04630847800407837
Pearson correlation for Arabic with SVR is 0.1715946218286482
Pearson correlation for Portuguese with SVR is 0.44983401762435216
Pearson correlation for English with SVR is 0.40963944670035474
Pearson correlation for French with SVR is 0.45503915202838935
Pearson correlation for Spanish with SVR is 0.5242129956623359
Pearson correlation for Italian with SVR is 0.416692357071758
Pearson correlation for Hindi with SVR is 0.17567025158467825
Pearson correlation for Dutch with SVR is 0.19130633387004328
Pearson correlation for Chinese with SVR is -0.005174803024200055
