In [0]:
# Download modules and install necessary packages

%pip install nltk

Python interpreter will be restarted.
Python interpreter will be restarted.


#Import modules

In [0]:
# Import modules needed for TF-IDF and data preprocessing

import nltk
from pyspark.sql import SparkSession
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
stop = stopwords.words('english')
spark = SparkSession.builder.appName("612_Proj").config("spark.task.cpus", "2").getOrCreate()
from nltk.tokenize import word_tokenize
from pyspark.sql.functions import udf, split, col, concat_ws
from pyspark.sql.types import ArrayType, StringType, IntegerType
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from pyspark.ml.feature import IDF, CountVectorizer,VectorAssembler

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
# This function loads a CSV file into a dataframe
#
# Parameters:
#    filename: name of the file to be opened
#
# Returns:
#    a pyspark dataframe
#
def csv_to_df(fname):
    
    #Location of dbfs
    dbfs_loc = "dbfs:/FileStore/shared_uploads/sam.rainbow@ucalgary.ca/"
    filename_complete = dbfs_loc + fname
    filetype="csv"
    
    #Options for loading
    inf_sch = "true"
    is_header = "true"
    delim = ","
    multiline = "true"
    escape = "\""
    
    # Load the data into a dataframe using options above
    df = spark.read.format(filetype)\
                .option("header", is_header)\
                .option("inferSchema", inf_sch)\
                .option("multiline", multiline)\
                .option("escape", escape).load(filename_complete)
    
    return df

In [0]:
# Load the CSV in to a dataframe
all_reviews = csv_to_df("IMDB_dataset_rated_final.csv")

# Concatenate review title and review content with space inbetween
all_reviews = all_reviews.withColumn("Complete_Content", concat_ws(" ", "Review Title", "Review Content"))

In [0]:
# Cleans the text by removing non-alphabetic characters and whitespace
def clean_text(text):
    text = ''.join(c for c in text if c.isalpha() or c.isspace())
    text = text.lower()
    return text

# Function to remove stop words by tokenizing the words and comparing it to words in the nltk corpus
def remove_stopwords(text):
    stop = set(nltk.corpus.stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop]
    
    return filtered_tokens

# Function to lemmatize tokens with POS using WordNetLemmatizer
def lemmatize_text_pos(filtered_tokens):
    lemmatizer = WordNetLemmatizer()
    pos_tagged = nltk.pos_tag(filtered_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) if get_wordnet_pos(pos) else token for token, pos in pos_tagged]
    return lemmatized_tokens

# This function completes stemming on the tokenized text using snowball stemmer
def stemmer_snowball(filtered_tokens):
    snowball = SnowballStemmer(language = 'english')
    print(filtered_tokens)
    stemmed_words = [snowball.stem(word) for word in filtered_tokens]
    return stemmed_words

# Helper function to get the WordNet POS tag from the NLTK POS tag
def get_wordnet_pos(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return 'a'
    elif nltk_pos_tag.startswith('V'):
        return 'v'
    elif nltk_pos_tag.startswith('N'):
        return 'n'
    elif nltk_pos_tag.startswith('R'):
        return 'r'
    else:
        return None

# Function that applies the cleaning, stop word removal, and lemmatization functions
def preprocess_text(text):
    cleaned = clean_text(text)
    filtered = remove_stopwords(cleaned)
    lemmatized = lemmatize_text_pos(filtered)
    return lemmatized

# Function that applies cleaning, remove stopwords, and stemming
def preprocess_text_stemming(text):
    cleaned = clean_text(text)
    filtered = remove_stopwords(cleaned)
    stemmed = stemmer_snowball(filtered)
    return stemmed

# Default of preprocessing text using lemmatization
preprocess_text_udf = udf(preprocess_text, ArrayType(StringType()))

# Preprocessing text using snowball stemming
preprocess_text_udf_stemmed = udf(preprocess_text_stemming, ArrayType(StringType()))

# Apply the user-defined function to the "Review Content" column and create a new column "lemmatized_text" and "stemmed_text"
df = all_reviews.withColumn("lemmatized_text", preprocess_text_udf("Complete_Content"))
df = df.withColumn("stemmed_text", preprocess_text_udf_stemmed("Complete_Content"))

In [0]:
# Create a new column "Month" by splitting the "Review Date" column on dash and accessing the second element
df = df.withColumn("Month", split(col("Review Date"), "-")[1])

In [0]:
# Converts the lemmatized text to a vector and fits it to and IDF function

# Create a CountVectorizer object for the lemmatized words
cv = CountVectorizer(inputCol="lemmatized_text", outputCol="lemmatized_text_vector")

# Fit the CountVectorizer model on the dataframe
cv_model = cv.fit(df)

# Transform the dataframe to add the count vectorized features column
count_vectorized_df = cv_model.transform(df)

# Define an IDF function to compute the inverse document frequency on the lemmatized text
idf = IDF(inputCol="lemmatized_text_vector", outputCol="TFIDF_features_uni-gram")

# Fit the IDF function to the dataframe to compute the IDF values
idf_model = idf.fit(count_vectorized_df)

# Apply the IDF model to the dataframe to create a new column with the normalized features
df_tfidf = idf_model.transform(count_vectorized_df)


In [0]:
# Converts the Stemmed text to a vector and fits it to and IDF function

# Create a CountVectorizer object for the lemmatized words
cv = CountVectorizer(inputCol="stemmed_text", outputCol="stemmed_text_vector")

# Fit the CountVectorizer model on the dataframe
cv_model = cv.fit(df_tfidf)

# Transform the dataframe to add the count vectorized features column
count_vectorized_df = cv_model.transform(df_tfidf)

# define an IDF function to compute the inverse document frequency
idf = IDF(inputCol="stemmed_text_vector", outputCol="TFIDF_features_uni-gram_stemmed")

# fit the IDF function to the dataframe to compute the IDF values
idf_model = idf.fit(count_vectorized_df)

# apply the IDF model to the dataframe to create a new column with the normalized features
df_tfidf = idf_model.transform(count_vectorized_df)

In [0]:
# Retrieves just the month from the dataframe and vectorizes it using CountVectorizer

# Retrieves just the month by splitting on the ","
count_vectorized_df = df_tfidf.withColumn("Month_Vector", split(count_vectorized_df["Month"], ","))

# Create a CountVectorizer object for the month
cv = CountVectorizer(inputCol="Month_Vector", outputCol="Month_vector")

# Fit the CountVectorizer model on the dataframe
cv_model = cv.fit(count_vectorized_df)

# Transform the dataframe to add the count vectorized features column
count_vectorized_df = cv_model.transform(count_vectorized_df)

In [0]:
# Seperate the genre into a new column and vectorizes the genre similar to the month vectorization above

# Seperate genre into new column and preprocesses it
count_vectorized_df = count_vectorized_df.withColumn("genre_lemmatized", preprocess_text_udf("Movie Genre"))

# Create a CountVectorizer object for the genre
cv = CountVectorizer(inputCol="genre_lemmatized", outputCol="genre_lemmatized_vector")

# Fit the CountVectorizer model on the dataframe
cv_model = cv.fit(count_vectorized_df)

# Transform the dataframe to add the count vectorized features column
count_vectorized_df = cv_model.transform(count_vectorized_df)

In [0]:
# Assembles all feature vectors created above into one using vector assembler (month, genre, and TFIDF)

# Create a VectorAssembler object for both the stemmed and lemmatized text
assembler = VectorAssembler(inputCols=["TFIDF_features_uni-gram", "Month_vector", "genre_lemmatized_vector"], outputCol="combined_vectors")
assembler1 = VectorAssembler(inputCols=["TFIDF_features_uni-gram_stemmed", "Month_vector", "genre_lemmatized_vector"], outputCol="combined_vectors_stem")

# Apply the assembler to the DataFrame
assembled_df = assembler.transform(count_vectorized_df)
assembled_df = assembler1.transform(assembled_df)


In [0]:
# Creates a data frame that only contains the data the TFIDF model will use (combined_vectors) . The "Complete_Content" column is included to be able to compare the models predictions back to the manually labelled sentiment to be able to address misclassification.

assembled_df = assembled_df.select(assembled_df["combined_vectors"],assembled_df["Manual_Combined"],assembled_df["Complete_Content"],assembled_df["combined_vectors_stem"])

In [0]:
# Function that increase the raters combined sentiment from a scale of -1,0,1 to 0,1,2 by incrementing each score by 1.

from pyspark.sql.functions import col

# This function increases a column value by 1. 
#   Requires the dataframe and the name of the column to be increased.
#   Returns a new df.
def increment_column(df, column_name):
    incremented_col = col(column_name) + 1
    new_df = df.withColumn(column_name, incremented_col)
    
    return new_df

In [0]:
# Calls the increment_column function to increase the manual rating by 1
assembled_df = increment_column(assembled_df, "Manual_Combined")

In [0]:
# Splits the data into a training and testing set based on a 70/30 split as is a commonly used ratio.
(training, testing) = assembled_df.randomSplit([0.7, 0.3])

# ML Model Training and Evaluation

#1. Naive Bayes

####     Using Lemmatization

In [0]:
# Import the necessary models
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create initial Naïve Bayes model
nb = NaiveBayes(labelCol="Manual_Combined", featuresCol="combined_vectors", modelType="multinomial")

# Evaluate model using a MulticlassificationEvaluator
nbevaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
nb_params = [0.2,0.5, 0.7, 0.9]

# Initialize empty lists to house the scores of the model with the given parameters
nb_f1_scores =[]
nb_recall_scores = []
nb_precision_scores = []

# This loops through the parameters and fits the model using the parameters specified above
for i in nb_params:
    
    #Create ParamGrid for Cross Validation
    nbparamGrid = (ParamGridBuilder()
                   .addGrid(nb.smoothing, [i])
                   .build())


    #Create 5-fold CrossValidator
    nbcv = CrossValidator(estimator = nb,
                        estimatorParamMaps = nbparamGrid,
                        evaluator = nbevaluator,
                        numFolds = 5)
    
    # Fit the model and creat a prediction
    nbcvModel = nbcv.fit(training)
    prediction = nbcvModel.transform(testing).cache()

    # The three code snippets below populate the evaluation score lists initialized above.
    evaluator_f1_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
    nb_f1_score = evaluator_f1_nb.evaluate(prediction)
    nb_f1_scores.append(nb_f1_score)

    evaluator_recall_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
    nb_recall_score = evaluator_recall_nb.evaluate(prediction, {evaluator_recall_nb.metricName: "weightedRecall"})
    nb_recall_scores.append(nb_recall_score)

    evaluator_precision_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
    nb_precision_score = evaluator_precision_nb.evaluate(prediction, {evaluator_precision_nb.metricName: "weightedPrecision"})
    nb_precision_scores.append(nb_precision_score)

In [0]:
# This code prints the 

total_result = []
for i in range(4):
    templist = []
    templist.append(nb_params[i])
    templist.append(nb_f1_scores[i])
    templist.append(nb_recall_scores[i])
    templist.append(nb_precision_scores[i])
    total_result.append(templist)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("Alpha", StringType(), True),
    StructField("f1_score", StringType(), True),
    StructField("recall_score", StringType(), True),
    StructField("precision_score", StringType(), True)
])

df_nb_results = spark.createDataFrame(data=total_result, schema=schema)
df_nb_results.display()

Alpha,f1_score,recall_score,precision_score
0.2,0.610319694984813,0.6101694915254238,0.6121079133760268
0.5,0.6194127461694763,0.6169491525423729,0.6272766502685441
0.7,0.6208831898351024,0.6169491525423729,0.6326265733045394
0.9,0.6140400124153416,0.6101694915254238,0.6262733574760508


#### Stemming

In [0]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create initial Naïve Bayes model
nb = NaiveBayes(labelCol="Manual_Combined", featuresCol="combined_vectors_stem", modelType="multinomial")


# Evaluate model using a MulticlassificationEvaluator
nbevaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
nb_params = [0.2,0.5, 0.7, 0.9]

# Initialize empty lists to house the scores of the model with the given parameters
nb_f1_scores =[]
nb_recall_scores = []
nb_precision_scores = []

# This loops through the parameters and fits the model using the parameters specified above
for i in nb_params:
    
    #Create ParamGrid for Cross Validation
    nbparamGrid = (ParamGridBuilder()
                   .addGrid(nb.smoothing, [i])
                   .build())


    #Create 5-fold CrossValidator
    nbcv = CrossValidator(estimator = nb,
                        estimatorParamMaps = nbparamGrid,
                        evaluator = nbevaluator,
                        numFolds = 5)
    
    # Fit the model and creat a prediction
    nbcvModel = nbcv.fit(training)
    prediction = nbcvModel.transform(testing).cache()

    # The three code snippets below populate the evaluation score lists initialized above.
    evaluator_f1_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
    nb_f1_score = evaluator_f1_nb.evaluate(prediction)
    nb_f1_scores.append(nb_f1_score)

    evaluator_recall_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
    nb_recall_score = evaluator_recall_nb.evaluate(prediction, {evaluator_recall_nb.metricName: "weightedRecall"})
    nb_recall_scores.append(nb_recall_score)

    evaluator_precision_nb = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
    nb_precision_score = evaluator_precision_nb.evaluate(prediction, {evaluator_precision_nb.metricName: "weightedPrecision"})
    nb_precision_scores.append(nb_precision_score)

In [0]:
total_result = []
for i in range(3):
    templist = []
    templist.append(nb_params[i])
    templist.append(nb_f1_scores[i])
    templist.append(nb_recall_scores[i])
    templist.append(nb_precision_scores[i])
    total_result.append(templist)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("Alpha", StringType(), True),
    StructField("f1_score", StringType(), True),
    StructField("recall_score", StringType(), True),
    StructField("precision_score", StringType(), True)
])

df_nb_results = spark.createDataFrame(data=total_result, schema=schema)
df_nb_results.display()

Alpha,f1_score,recall_score,precision_score
0.2,0.5779258611209561,0.5830508474576271,0.578988635428823
0.5,0.5818780722540237,0.5830508474576271,0.5862975850977501
0.7,0.5777112806533977,0.5796610169491525,0.5807747676429017


#2. Logistic Regression

#### Lemmatization

In [0]:
from pyspark.ml.classification import LogisticRegression

# Create initial Logistic Regression model
lr = LogisticRegression(labelCol="Manual_Combined", featuresCol="combined_vectors")

# Evaluate model using a MulticlassificationEvaluator
lrevaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
params_lr = [0.005, 0.008,0.001]

# Initialize empty lists to house the scores of the model with the given parameters
f1_scores_lr =[]
recall_scores_lr = []
precision_scores_lr = []

# This loops through the parameters and fits the model using the parameters specified above
for i in params_lr:
    # Create ParamGrid for Cross Validation
    lrparamGrid = (ParamGridBuilder()
                   .addGrid(lr.regParam, [i])
                   .build())


    # Create 5-fold CrossValidator
    lrcv = CrossValidator(estimator = lr,
                        estimatorParamMaps = lrparamGrid,
                        evaluator = lrevaluator,
                        numFolds = 5)
    
    # Fit the model and create a prediction
    lrcvModel = lrcv.fit(training)
    prediction = lrcvModel.transform(testing).cache()

    # The three code snippets below populate the evaluation score lists initialized above.
    evaluator_f1_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
    lr_f1_score = evaluator_f1_lr.evaluate(prediction)
    f1_scores_lr.append(lr_f1_score)

    evaluator_recall_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
    lr_recall_score = evaluator_recall_lr.evaluate(prediction, {evaluator_recall_lr.metricName: "weightedRecall"})
    recall_scores_lr.append(lr_recall_score)

    evaluator_precision_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
    lr_precision_score = evaluator_precision_lr.evaluate(prediction, {evaluator_precision_lr.metricName: "weightedPrecision"})
    precision_scores_lr.append(lr_precision_score)

In [0]:
lr_total_result = []
for i in range(3):
    templist = []
    templist.append(params_lr[i])
    templist.append(f1_scores_lr[i])
    templist.append(recall_scores_lr[i])
    templist.append(precision_scores_lr[i])
    lr_total_result.append(templist)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("Alpha", StringType(), True),
    StructField("f1_score", StringType(), True),
    StructField("recall_score", StringType(), True),
    StructField("precision_score", StringType(), True)
])

df_nb_results = spark.createDataFrame(data=lr_total_result, schema=schema)
df_nb_results.display()

Alpha,f1_score,recall_score,precision_score
0.005,0.6426088447607181,0.688135593220339,0.6434385227363435
0.008,0.6447928280045218,0.6915254237288135,0.6513013916403747
0.001,0.6385692439575567,0.6813559322033899,0.6300033077644537


### Stemming

In [0]:
# Create initial Logistic Regression model
lr = LogisticRegression(labelCol="Manual_Combined", featuresCol="combined_vectors_stem")

# Evaluate model using a MulticlassificationEvaluator
lrevaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
params_lr = [0.005, 0.008,0.001]

# Initialize empty lists to house the scores of the model with the given parameters
f1_scores_lr =[]
recall_scores_lr = []
precision_scores_lr = []

# This loops through the parameters and fits the model using the parameters specified above
for i in params_lr:
    # Create ParamGrid for Cross Validation
    lrparamGrid = (ParamGridBuilder()
                   .addGrid(lr.regParam, [i])
                   .build())


    # Create 5-fold CrossValidator
    lrcv = CrossValidator(estimator = lr,
                        estimatorParamMaps = lrparamGrid,
                        evaluator = lrevaluator,
                        numFolds = 5)
    
    # Fit the model and create a prediction
    lrcvModel = lrcv.fit(training)
    prediction = lrcvModel.transform(testing).cache()

    # The three code snippets below populate the evaluation score lists initialized above.
    evaluator_f1_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
    lr_f1_score = evaluator_f1_lr.evaluate(prediction)
    f1_scores_lr.append(lr_f1_score)

    evaluator_recall_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
    lr_recall_score = evaluator_recall_lr.evaluate(prediction, {evaluator_recall_lr.metricName: "weightedRecall"})
    recall_scores_lr.append(lr_recall_score)

    evaluator_precision_lr = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
    lr_precision_score = evaluator_precision_lr.evaluate(prediction, {evaluator_precision_lr.metricName: "weightedPrecision"})
    precision_scores_lr.append(lr_precision_score)


In [0]:
lr_total_result = []
for i in range(3):
    templist = []
    templist.append(params_lr[i])
    templist.append(f1_scores_lr[i])
    templist.append(recall_scores_lr[i])
    templist.append(precision_scores_lr[i])
    lr_total_result.append(templist)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("Alpha", StringType(), True),
    StructField("f1_score", StringType(), True),
    StructField("recall_score", StringType(), True),
    StructField("precision_score", StringType(), True)
])

df_nb_results = spark.createDataFrame(data=lr_total_result, schema=schema)
df_nb_results.display()

Alpha,f1_score,recall_score,precision_score
0.005,0.6669253844994709,0.6983050847457627,0.6797047555939275
0.008,0.6694688629396821,0.7016949152542373,0.6866921428464385
0.001,0.650238599987067,0.6813559322033897,0.663986328894413


#2. RANDOM FOREST

### Lemmatization|

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# RandomForestClassifier
rf = RandomForestClassifier(labelCol="Manual_Combined", featuresCol="combined_vectors")

# Evaluate model using a MulticlassificationEvaluator
rf_evaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
params_rf_trees = [10, 30, 50]
params_rf_maxdepth = [10, 15, 20]

# Initialize empty lists to house the scores of the model with the given parameters
i=0
f1_scores_rf =[]
recall_scores_rf = []
precision_scores_rf = []

# This loops through the parameters and fits the model using the parameters specified above
for depth in params_rf_maxdepth:
    j=0
    #iterating through number of trees
    for trees in params_rf_trees:
        # Create ParamGrid for Cross Validation
        rfparamGrid = (ParamGridBuilder()
                       .addGrid(rf.numTrees, [trees]) \
        .addGrid(rf.maxDepth, [depth]) \
                       .build())


        # Create 5-fold CrossValidator
        rfcv = CrossValidator(estimator = rf,
                            estimatorParamMaps = rfparamGrid,
                            evaluator = rf_evaluator,
                            numFolds = 5)
        rfcvModel = rfcv.fit(training)

        prediction = rfcvModel.transform(testing).cache()

        #Evaluating F1 score
        evaluator_f1_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
        f1_score = evaluator_f1_rf.evaluate(prediction)
        f1_scores_rf.append(f1_score)
        
        #evaluating recall score
        evaluator_recall_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
        recall_score = evaluator_recall_rf.evaluate(prediction, {evaluator_recall_lr.metricName: "weightedRecall"})
        recall_scores_rf.append(recall_score)

        #evaluating precision score
        evaluator_precision_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
        precision_score = evaluator_precision_rf.evaluate(prediction, {evaluator_precision_lr.metricName: "weightedPrecision"})
        precision_scores_rf.append(precision_score)


In [0]:
x = 0
for i in range(3):
    for j in range(3):
        print("Max depth is : " + str(params_rf_maxdepth[i]))
        print("Number of trees is: " + str(params_rf_trees[j]))
        print("f1 is: " + str(f1_scores_rf[x]))
        print("recall is: " + str(recall_scores_rf[x]))
        print("precision is: " + str(precision_scores_rf[x]))
        x+=1

Max depth is : 10
Number of trees is: 10
f1 is: 0.40367956966482
recall is: 0.5050847457627119
precision is: 0.4345650959210281
Max depth is : 10
Number of trees is: 30
f1 is: 0.39030341075538816
recall is: 0.5050847457627119
precision is: 0.4744684424985912
Max depth is : 10
Number of trees is: 50
f1 is: 0.4061137059637736
recall is: 0.5152542372881356
precision is: 0.48513786635518097
Max depth is : 15
Number of trees is: 10
f1 is: 0.4115146623621199
recall is: 0.5016949152542373
precision is: 0.4049588377723971
Max depth is : 15
Number of trees is: 30
f1 is: 0.4608579635822515
recall is: 0.5525423728813559
precision is: 0.4766076834974385
Max depth is : 15
Number of trees is: 50
f1 is: 0.4724160035545253
recall is: 0.5627118644067797
precision is: 0.4927145575670404
Max depth is : 20
Number of trees is: 10
f1 is: 0.42483413700242567
recall is: 0.5050847457627119
precision is: 0.40579440879500384
Max depth is : 20
Number of trees is: 30
f1 is: 0.48555028248587573
recall is: 0.5728813

### Stemming

In [0]:
# RandomForestClassifier
rf = RandomForestClassifier(labelCol="Manual_Combined", featuresCol="combined_vectors_stem")

# Evaluate model using a MulticlassificationEvaluator
rf_evaluator = MulticlassClassificationEvaluator(labelCol="Manual_Combined", metricName="weightedFMeasure")

# Set the parameters to be passed to the model in the GridBuilder
params_rf_trees = [10, 30, 50]
params_rf_maxdepth = [10, 15, 20]

# Initialize empty lists to house the scores of the model with the given parameters
i=0
f1_scores_rf =[]
recall_scores_rf = []
precision_scores_rf = []

# This loops through the parameters and fits the model using the parameters specified above
for depth in params_rf_maxdepth:
    j=0
    #iterating through number of trees
    for trees in params_rf_trees:
        # Create ParamGrid for Cross Validation
        rfparamGrid = (ParamGridBuilder()
                       .addGrid(rf.numTrees, [trees]) \
        .addGrid(rf.maxDepth, [depth]) \
                       .build())


        # Create 5-fold CrossValidator
        rfcv = CrossValidator(estimator = rf,
                            estimatorParamMaps = rfparamGrid,
                            evaluator = rf_evaluator,
                            numFolds = 5)
        rfcvModel = rfcv.fit(training)

        prediction = rfcvModel.transform(testing).cache()

        #Evaluating F1 score
        evaluator_f1_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedFMeasure")
        f1_score = evaluator_f1_rf.evaluate(prediction)
        f1_scores_rf.append(f1_score)
        
        #evaluating recall score
        evaluator_recall_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedRecall")
        recall_score = evaluator_recall_rf.evaluate(prediction, {evaluator_recall_lr.metricName: "weightedRecall"})
        recall_scores_rf.append(recall_score)

        #evaluating precision score
        evaluator_precision_rf = MulticlassClassificationEvaluator(labelCol="Manual_Combined", predictionCol="prediction", metricName="weightedPrecision")
        precision_score = evaluator_precision_rf.evaluate(prediction, {evaluator_precision_lr.metricName: "weightedPrecision"})
        precision_scores_rf.append(precision_score)

In [0]:
x=0
for i in range(3):
    for j in range(3):
        print("Max depth is : " + str(params_rf_maxdepth[i]))
        print("Number of trees is: " + str(params_rf_trees[j]))
        print("f1 is: " + str(f1_scores_rf[x]))
        print("recall is: " + str(recall_scores_rf[x]))
        print("precision is: " + str(precision_scores_rf[x]))

        x+=1

Max depth is : 10
Number of trees is: 10
f1 is: 0.471549034488574
recall is: 0.5525423728813559
precision is: 0.5815962580577826
Max depth is : 10
Number of trees is: 30
f1 is: 0.4452514316835278
recall is: 0.5457627118644067
precision is: 0.5078586355206876
Max depth is : 10
Number of trees is: 50
f1 is: 0.4114602679039692
recall is: 0.5186440677966102
precision is: 0.4977845417137207
Max depth is : 15
Number of trees is: 10
f1 is: 0.4660184347404428
recall is: 0.5423728813559322
precision is: 0.5457977821973938
Max depth is : 15
Number of trees is: 30
f1 is: 0.5186582536609874
recall is: 0.6033898305084746
precision is: 0.5156549701978017
Max depth is : 15
Number of trees is: 50
f1 is: 0.4885728345662679
recall is: 0.576271186440678
precision is: 0.5159695524914691
Max depth is : 20
Number of trees is: 10
f1 is: 0.4875199787985015
recall is: 0.5593220338983051
precision is: 0.5557642278887844
Max depth is : 20
Number of trees is: 30
f1 is: 0.5339726499596422
recall is: 0.616949152542