In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv") 


In [2]:
def sentiment_analysis(text):
# Extract sentiment of the review text'''
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [5]:
import pandas as pd

def process(df):
    # This is where you can do all your processing

    df['Helpfulness'] = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    df['Helpfulness'] = df['Helpfulness'].fillna(0)

    df['ReviewLength'] = df.apply(lambda row : len(row['Text'].split()) if type(row['Text']) == str else 0, axis = 1)
    df["Combined_Review"] = df['Text'].astype(str) +" - "+ df["Summary"].astype(str)

    df['Text_Sentiments'] = df['Text'].astype(str).apply(sentiment_analysis)
    df['Summary_Sentiments'] = df['Summary'].astype(str).apply(sentiment_analysis)
    df['Combined_Sentiments'] = df['Combined_Review'].astype(str).apply(sentiment_analysis)

    # Implement tf-idf vectorizer
    tfidf = TfidfVectorizer(max_features=5000)
    tfidf.fit(df['Combined_Review'].values.astype('U'))

    text_features = tfidf.transform(df['Combined_Review'].values.astype('U'))
    df = pd.concat([df, pd.DataFrame(text_features.toarray(), columns=tfidf.get_feature_names())], axis=1)

    return df

# Load the dataset
trainingSet = pd.read_csv("./data/train.csv")

# Implementing tf-idf vectorizer 
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Process the DataFrame
train_processed = process(trainingSet)

# Load test set
submissionSet = pd.read_csv("./data/test.csv")

# Merge on Id so that the test set can have feature columns as well
testX= pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
testX = testX.drop(columns=['Score_x'])
testX = testX.rename(columns={'Score_y': 'Score'})

# The training set is where the score is not null
trainX =  train_processed[train_processed['Score'].notnull()]

# Save the datasets with the new features for easy access later
testX.to_csv("./data/X_test.csv", index=False)
trainX.to_csv("./data/X_train.csv", index=False)




In [6]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# Load training set with new features into DataFrame
X_train = pd.read_csv("./data/X_train.csv")

# Split training set into training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(
        X_train.drop(['Score'], axis=1),
        X_train['Score'],
        test_size=1/4.0,
        random_state=0
    )

# This is where you can do more feature selection
X_train_processed = X_train.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'Time', 'Combined_Review'])
X_test_processed = X_test.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'Time', 'Combined_Review'])

# model = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8)
# model.fit(X_train_processed, Y_train)

model = XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=8, subsample=0.7, colsample_bytree=0.8)
model.fit(X_train_processed, Y_train, early_stopping_rounds=10, eval_set=[(X_test_processed, Y_test)])


# pickle model - saves it so you can load it later
with open('xgboost_model.obj', 'wb') as f:
    pickle.dump(model, f)
# to load pickled model: 
# with open('filename', 'rb') as f:
#    model = pickle.load(f)

# Evaluate your model on the testing set
Y_test_predictions = model.predict(X_test_processed)
# print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))
print("RMSE on testing set = ", mean_squared_error(Y_test, Y_test_predictions, squared=False))


# Plot a confusion matrix
# cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
# sns.heatmap(cm, annot=True)
# plt.title('Confusion matrix of the classifier')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()



[0]	validation_0-rmse:3.77928
[1]	validation_0-rmse:3.74383
[2]	validation_0-rmse:3.70873
[3]	validation_0-rmse:3.67404
[4]	validation_0-rmse:3.63973
[5]	validation_0-rmse:3.60575
[6]	validation_0-rmse:3.57259
[7]	validation_0-rmse:3.53933
[8]	validation_0-rmse:3.50644
[9]	validation_0-rmse:3.47389
[10]	validation_0-rmse:3.44172
[11]	validation_0-rmse:3.41004
[12]	validation_0-rmse:3.37846
[13]	validation_0-rmse:3.34733
[14]	validation_0-rmse:3.31648
[15]	validation_0-rmse:3.28591
[16]	validation_0-rmse:3.25563
[17]	validation_0-rmse:3.22588
[18]	validation_0-rmse:3.19626
[19]	validation_0-rmse:3.16690
[20]	validation_0-rmse:3.13793
[21]	validation_0-rmse:3.10927
[22]	validation_0-rmse:3.08095
[23]	validation_0-rmse:3.05299
[24]	validation_0-rmse:3.02527
[25]	validation_0-rmse:2.99779
[26]	validation_0-rmse:2.97056
[27]	validation_0-rmse:2.94366
[28]	validation_0-rmse:2.91707
[29]	validation_0-rmse:2.89077
[30]	validation_0-rmse:2.86494
[31]	validation_0-rmse:2.83921
[32]	validation_0-

In [None]:
X_submission = pd.read_csv("./data/X_test.csv")
X_submission_processed = X_submission.drop(columns=['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'Score', 'Time', 'Combined_Review'])

X_submission['Score'] = model.predict(X_submission_processed)
submission = X_submission[['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)