In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [26]:
# Load the preprocessed essay dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a row index
# that was written into the CSV — consider reading with index_col=0; confirm against the file.
df = pd.read_csv('Datasets/Preprocessed_df.csv')

In [27]:
# Preview the first rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,preprocessed_text
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,i think that would benefit from learning at th...
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,when a problem is a change you have to let it ...
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,dear principal if u change the school policy o...
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,the best time in life is when you become yours...
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,small act of kindness can impact in other peop...


In [28]:
# Discard the raw essay text and its identifier; only the scores and the
# cleaned text are used from here on.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns have already been removed.
df = df.drop(columns=['full_text', 'text_id'], errors='ignore')

In [29]:
# Preview again to confirm 'full_text' and 'text_id' are no longer present.
df.head()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,preprocessed_text
0,3.5,3.5,3.0,3.0,4.0,3.0,i think that would benefit from learning at th...
1,2.5,2.5,3.0,2.0,2.0,2.5,when a problem is a change you have to let it ...
2,3.0,3.5,3.0,3.0,3.0,2.5,dear principal if u change the school policy o...
3,4.5,4.5,4.5,4.5,4.0,5.0,the best time in life is when you become yours...
4,2.5,3.0,3.0,3.0,2.5,2.5,small act of kindness can impact in other peop...


In [92]:
def choose_regressors(regressor_name = "linear_regression"):
    """
    Build a fresh, default-configured estimator for the given name.

    Parameters
    ----------
    regressor_name : str
        Identifier of the estimator to create, e.g. 'ridge' or
        'random_forest_regression'. Defaults to 'linear_regression'.

    Returns
    -------
    A new, unfitted estimator instance constructed with no arguments.

    Raises
    ------
    ValueError
        If regressor_name does not match any supported identifier.
    """
    # Ordered (name, factory) pairs. Each constructor is wrapped in a lambda so
    # that only the selected estimator class is looked up and instantiated.
    # NOTE(review): GaussianNB is a scikit-learn classifier rather than a
    # regressor — confirm whether the last entry should remain.
    factories = (
        ('linear_regression', lambda: LinearRegression()),
        ('lasso', lambda: Lasso()),
        ('ridge', lambda: Ridge()),
        ('elastic_net', lambda: ElasticNet()),
        ('decision_tree_regression', lambda: DecisionTreeRegressor()),
        ('random_forest_regression', lambda: RandomForestRegressor()),
        ('gradient_boosting_regression', lambda: GradientBoostingRegressor()),
        ('adaboost_regression', lambda: AdaBoostRegressor()),
        ('k_neighbors_regression', lambda: KNeighborsRegressor()),
        ('support_vector_regression', lambda: SVR()),
        ('xgboost_regression', lambda: XGBRegressor()),
        ('gaussian_naive_bayes_regression', lambda: GaussianNB()),
    )

    for known_name, make_estimator in factories:
        if regressor_name == known_name:
            return make_estimator()

    raise ValueError(f"Regressor {regressor_name} not supported for regression problems.")

In [59]:
def extract_bow(corpus, only_fit = True):
    """
    Fit a CountVectorizer (bag-of-words) on a collection of text documents.

    Parameters
    ----------
    corpus : iterable of text documents used to fit the vectorizer.
    only_fit : when truthy (the default) the vectorizer is only fitted;
        otherwise the corpus is also transformed in the same pass.

    Returns
    -------
    The fitted vectorizer when only_fit is truthy, otherwise a
    (vectorizer, bow_matrix) tuple where bow_matrix is the result of
    fit_transform on corpus.
    """
    bow_model = CountVectorizer()

    if not only_fit:
        # Fit and encode the corpus together, handing back both pieces.
        doc_term_matrix = bow_model.fit_transform(corpus)
        return bow_model, doc_term_matrix

    bow_model.fit(corpus)
    return bow_model

In [72]:
def calc_mse(y_true, y_pred):
    """
    Calculates the mean squared error (MSE) between the true and predicted values

    Parameters
    ----------
    y_true : ground-truth target values.
    y_pred : predicted target values, in the same order as y_true.

    Returns
    -------
    The value returned by sklearn's mean_squared_error for the two inputs.
    """
    # This function used to be defined twice in a row with identical bodies;
    # a single definition is kept so there is no silent shadowing.
    mse = mean_squared_error(y_true, y_pred)
    return mse

def calc_mae(y_true, y_pred):
    """
    Mean absolute error (MAE) of the predictions against the true values.

    Thin wrapper that forwards both arguments to sklearn's
    mean_absolute_error and returns its result unchanged.
    """
    return mean_absolute_error(y_true, y_pred)

def calc_rmse(y_true, y_pred):
    """
    Root mean squared error (RMSE) of the predictions against the true values.

    Computed as the square root (np.sqrt) of sklearn's mean_squared_error
    for the same two inputs.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calc_mape(y_true, y_pred):
    """
    Calculates the mean absolute percentage error (MAPE) between the true and predicted values

    Parameters
    ----------
    y_true : array-like of ground-truth values (list, ndarray, pandas Series, ...).
    y_pred : array-like of predicted values, in the same order as y_true.

    Returns
    -------
    mean(|y_true - y_pred| / |y_true|) * 100, i.e. the error as a percentage.
    Entries where y_true is 0 divide by zero and make the result inf or nan.
    """
    # Coerce both inputs to plain float arrays first. This makes the arithmetic
    # strictly positional: plain Python lists are accepted, and two pandas
    # Series with different indexes are no longer label-aligned into NaNs.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

def calc_r2_score(y_true, y_pred):
    """
    R2 score of the predictions against the true values.

    Thin wrapper that forwards both arguments to sklearn's r2_score and
    returns its result unchanged.
    """
    return r2_score(y_true, y_pred)

In [78]:
def print_metrics_function(y_actual, y_predictions):
    """
    Print the evaluation metrics for one set of predictions.

    Computes MSE, MAE, RMSE, MAPE and the R2 score (in that order) with the
    calc_* helpers and prints each one on its own line, prefixed by its label.
    Nothing is returned.
    """
    # Label / metric pairs, listed in the order they are reported.
    labelled_metrics = (
        ("MSE:", calc_mse),
        ("MAE:", calc_mae),
        ("RMSE:", calc_rmse),
        ("MAPE:", calc_mape),
        ("R2 Score:", calc_r2_score),
    )

    for label, metric_fn in labelled_metrics:
        print(label, metric_fn(y_actual, y_predictions))

In [48]:
# Look at the frame once more before pulling out the individual columns.
df.head()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,preprocessed_text
0,3.5,3.5,3.0,3.0,4.0,3.0,i think that would benefit from learning at th...
1,2.5,2.5,3.0,2.0,2.0,2.5,when a problem is a change you have to let it ...
2,3.0,3.5,3.0,3.0,3.0,2.5,dear principal if u change the school policy o...
3,4.5,4.5,4.5,4.5,4.0,5.0,the best time in life is when you become yours...
4,2.5,3.0,3.0,3.0,2.5,2.5,small act of kindness can impact in other peop...


In [49]:
# Pull each score column out of the frame as its own Series, in column order.
cohesion, syntax, vocabulary, phraseology, grammar, conventions = (
    df[score_column]
    for score_column in (
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    )
)

# The cleaned essay text, used later as the model input.
preprocessed_text = df['preprocessed_text']

In [50]:
# Model input is the cleaned text; each score column is a separate target.
X = preprocessed_text
y_cohesion = cohesion
y_syntax = syntax
y_vocabulary = vocabulary
y_phraseology = phraseology
y_grammar = grammar
y_conventions = conventions

# Split the text and all six targets in ONE call. train_test_split applies the
# same row selection to every array it is given, so X_train / X_test are
# guaranteed to line up with every y_train_* / y_test_* pair. The previous
# version called it six times, overwriting X_train / X_test on each call and
# relying on the repeated random_state to keep the rows aligned.
# The result is ordered as (train, test) pairs, one pair per input array.
(
    X_train, X_test,
    y_train_cohesion, y_test_cohesion,
    y_train_syntax, y_test_syntax,
    y_train_vocabulary, y_test_vocabulary,
    y_train_phraseology, y_test_phraseology,
    y_train_grammar, y_test_grammar,
    y_train_conventions, y_test_conventions,
) = train_test_split(
    X,
    y_cohesion,
    y_syntax,
    y_vocabulary,
    y_phraseology,
    y_grammar,
    y_conventions,
    shuffle = True,
    random_state = 101,
    test_size = 0.2,
)

In [52]:
# Report the size of the train / test partitions; one line per item,
# with a rule separating the inputs from the (cohesion) targets.
print(
    "The shape of input train data: {}".format(X_train.shape),
    "The shape of input test data: {}".format(X_test.shape),
    "------------------------------------------",
    "The shape of output train data: {}".format(y_train_cohesion.shape),
    "The shape of output test data: {}".format(y_test_cohesion.shape),
    sep="\n",
)

The shape of input train data: (3128,)
The shape of input test data: (783,)
------------------------------------------
The shape of output train data: (3128,)
The shape of output test data: (783,)


### Bag-of-Words Vectorizer

In [61]:
# Fit the bag-of-words vectorizer on the training text and encode it in the same pass
# (only_fit = False makes extract_bow return both the vectorizer and the matrix).
bow_vectorizer, X_train_bow = extract_bow(X_train, only_fit = False)
# Encode the test text with the already-fitted vectorizer: transform only, no refit.
X_test_bow = bow_vectorizer.transform(X_test)

### Models with Metrics (Cohesion)

In [None]:
# Every model goes through the same steps on the bag-of-words features:
# build it, fit on the cohesion training targets, predict the test set and
# print the metrics. The runs are driven from one table instead of eleven
# copy-pasted stanzas, so adding or removing a model is a one-line change.
# Each entry is (banner printed above the report, name for choose_regressors).
cohesion_runs = [
    ("-----------------------Linear Regression-----------------------", "linear_regression"),
    ("-----------------------Lasso Regression-----------------------", "lasso"),
    ("-----------------------Ridge Regression-----------------------", "ridge"),
    ("-----------------------Elastic Net-----------------------", "elastic_net"),
    ("-----------------------Decision Tree Regression-----------------------", "decision_tree_regression"),
    ("-----------------------Random Forest Regression-----------------------", "random_forest_regression"),
    ("-----------------------Gradient Boosting Regression-----------------------", "gradient_boosting_regression"),
    ("-----------------------Adaboost Regression-----------------------", "adaboost_regression"),
    ("-----------------------K Neighbors Regression-----------------------", "k_neighbors_regression"),
    ("-----------------------Support Vector Regression-----------------------", "support_vector_regression"),
    ("-----------------------Xgboost Regression-----------------------", "xgboost_regression"),
]

for run_number, (banner, regressor_key) in enumerate(cohesion_runs):
    if run_number > 0:
        # Separator between consecutive reports (not printed before the first one).
        print("\n")
    print(banner)
    model = choose_regressors(regressor_key)
    model.fit(X_train_bow, y_train_cohesion)
    y_predictions = model.predict(X_test_bow)

    print_metrics_function(y_test_cohesion, y_predictions)

-----------------------Linear Regression-----------------------
MSE: 2.0745833645395626
MAE: 1.0719633812525378
RMSE: 1.4403414055492407
MAPE: 35.13028359246971
R2 Score: -3.7255632105864898


-----------------------Lasso Regression-----------------------
MSE: 0.43681169710650053
MAE: 0.5354061017606635
RMSE: 0.6609173148787226
MAPE: 18.09968243611829
R2 Score: 0.005014056758111396


-----------------------Ridge Regression-----------------------
MSE: 1.006517121591656
MAE: 0.7769694932143353
RMSE: 1.0032532689165063
MAPE: 25.718056329965165
R2 Score: -1.292682165449916


-----------------------Elastic Net-----------------------
MSE: 0.42709132105093783
MAE: 0.5275566922629452
RMSE: 0.6535222422006292
MAPE: 17.85305382038154
R2 Score: 0.027155491161942935


-----------------------Decision Tree Regression-----------------------
MSE: 0.671455938697318
MAE: 0.6468710089399745
RMSE: 0.8194241506676002
MAPE: 21.621713393744045
R2 Score: -0.5294673309703737


-----------------------Random For