In [1]:
import pandas as pd
import numpy as np
import nltk

#Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#Models from sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

#Evaluation metrics
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score



In [2]:
data_all = pd.read_csv('train.csv')
print(data_all)

                                                   text  label language
0     wearing a fake engagement ring so guys won’t a...    1.8  English
1                                  Bees vs. Wasps. http    1.0  English
2                  Here is a nice equation: 0+0-0-0+0=0    1.0  English
3                  @user @user Enjoy each new day!😊🇨🇦🐞🐭    1.6  English
4     I can be having a perfectly good day then I th...    1.6  English
...                                                 ...    ...      ...
9486  若被確認為「國際關注公共衛生緊急事件」， 世衛會發布一系列包括確診、隔離和治療的詳細計畫， ...    1.0  Chinese
9487                                @user 是嗎？ 可能我沒有注意到吧    2.0  Chinese
9488                                 @user @user 你剃过毛毛吗    3.8  Chinese
9489                                      @user 她没说是捐吧？    1.8  Chinese
9490  通报来了 真的要消停一会了 视频不要私信要啦 就当2w粉的福利提前放出来吧 有风险勿模仿 感...    1.6  Chinese

[9491 rows x 3 columns]


In [3]:
data = data_all[data_all['language'] == 'English'] 
data

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English
...,...,...,...
1582,PSA!!! Even though I’m busy 99.99999% of the t...,2.2,English
1583,@user @OtterBox Isnt that the only reason we b...,1.6,English
1584,#NetajiSubhasChandraBose The ART The ARTIST http,1.0,English
1585,Nothing compares with being with someone who a...,3.6,English


In [15]:
import re

def clean(text):
    
    text = re.sub('@user', '', text)
    text = re.sub('http', '', text)
    text = re.sub('@[\w]+', '', text)
    text = text.strip()
    return text

data['text'] = data['text'].apply(clean)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(clean)


Unnamed: 0,text,label,language
0,wearing fake engagement ring guy ’ approach li...,1.8,English
1,Bees vs. Wasps .,1.0,English
2,nice equation : 0+0-0-0+0=0,1.0,English
3,Enjoy new day ! 😊🇨🇦🐞🐭,1.6,English
4,perfectly good day think Star Wars episode 9 ’...,1.6,English


In [5]:
nltk.download('punkt')
nltk.download('wordnet')

#def stemming(text):
#    words = word_tokenize(text)
#    stemmer = PorterStemmer()
#    stemmed_words = [stemmer.stem(word) for word in words]
#    return ' '.join(stemmed_words)

#data['text'] = data['text'].apply(stemming)

def tokenize_and_lemmatize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(tokenize_and_lemmatize)





[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(tokenize_and_lemmatize)


In [6]:
from nltk.corpus import stopwords

# Download NLTK stop words
nltk.download('stopwords')

# Get the English stop words from NLTK
stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply the 'remove_stopwords' function to the 'text' column
data['text'] = data['text'].apply(remove_stopwords)

# Display the updated DataFrame
print(data.head())


                                                text  label language
0  wearing fake engagement ring guy ’ approach li...    1.8  English
1                                   Bees vs. Wasps .    1.0  English
2                        nice equation : 0+0-0-0+0=0    1.0  English
3                              Enjoy new day ! 😊🇨🇦🐞🐭    1.6  English
4  perfectly good day think Star Wars episode 9 ’...    1.6  English


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenarnoldkappala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(remove_stopwords)


In [7]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

In [8]:
# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
def models(mod, X_tr, y_tr, X_ts, y_ts):
    model = mod
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_ts)
    
    pearson_r, _ = pearsonr(y_pred, y_ts)
    print("Pearson's r for", model, "is: " , pearson_r)
    
    def calculate_pearson(mod, X_ts, y_ts):
        y_pred_1 = mod.predict(X_ts)
        pearson, _ = pearsonr(y_pred_1, y_ts)
        return pearson
    
    pearson_cv = cross_val_score(model, X_ts, y_ts, scoring=calculate_pearson, cv=5)
    print("Pearson's r for", model, "after cross validation is: " , pearson_cv)
    
    mse = mean_squared_error(y_pred, y_ts)
    print("Mean Square Error for", model, "is: " , mse)
      
    mse_cv = -cross_val_score(model, X_ts, y_ts, scoring='neg_mean_squared_error', cv=5)
    print("Mean Square Error for", model, "after cross validation is: " , mse_cv)
    
    return pearson_r, mse, mse_cv

In [10]:
lr = models(linear_model.LinearRegression(), X_train, y_train, X_test, y_test)

Pearson's r for LinearRegression() is:  0.4222047721812297
Pearson's r for LinearRegression() after cross validation is:  [0.34404329 0.46898081 0.43738794 0.46892611 0.30006956]
Mean Square Error for LinearRegression() is:  0.6772482482289995
Mean Square Error for LinearRegression() after cross validation is:  [0.55504586 0.40989142 1.00114435 0.65737452 1.05343783]


In [11]:
svr = models(svm.SVR(), X_train, y_train, X_test, y_test)

Pearson's r for SVR() is:  0.45948874220282154
Pearson's r for SVR() after cross validation is:  [0.2738383  0.35311762 0.36260323 0.4394647  0.329581  ]
Mean Square Error for SVR() is:  0.6140461505151605
Mean Square Error for SVR() after cross validation is:  [0.49303335 0.47747604 0.933174   0.6563515  0.95911158]


In [12]:
dt = models(tree.DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Pearson's r for DecisionTreeRegressor() is:  0.3293123137041315
Pearson's r for DecisionTreeRegressor() after cross validation is:  [0.23785693 0.3533487  0.21434633 0.37419039 0.25596965]
Mean Square Error for DecisionTreeRegressor() is:  0.8599256264350607
Mean Square Error for DecisionTreeRegressor() after cross validation is:  [0.80427951 0.77778103 1.45796007 1.09898258 1.43488977]


In [13]:
ridge = models(linear_model.Ridge(alpha=0.9), X_train, y_train, X_test, y_test)

Pearson's r for Ridge(alpha=0.9) is:  0.4589978597917358
Pearson's r for Ridge(alpha=0.9) after cross validation is:  [0.31829503 0.37713078 0.39652235 0.50461995 0.33602665]
Mean Square Error for Ridge(alpha=0.9) is:  0.5967985225645481
Mean Square Error for Ridge(alpha=0.9) after cross validation is:  [0.4802417  0.48668896 0.84598867 0.5938708  0.87652238]


In [14]:
rf = models(RandomForestRegressor(random_state=42), X_train, y_train, X_test, y_test)

Pearson's r for RandomForestRegressor(random_state=42) is:  0.45655435521371857
Pearson's r for RandomForestRegressor(random_state=42) after cross validation is:  [0.26134088 0.421299   0.33790317 0.47920137 0.34015393]
Mean Square Error for RandomForestRegressor(random_state=42) is:  0.6020618823370005
Mean Square Error for RandomForestRegressor(random_state=42) after cross validation is:  [0.61139428 0.44785449 1.05571412 0.6433565  1.0540422 ]
