# GIK2KM - Lab 4 - Text Mining

Run the code below to download the required language models and NLTK resources.

In [None]:
import os
import subprocess
import sys

import nltk
import spacy

# Download the NLTK resources used by the notebook. 'punkt_tab' is the
# tokenizer data that recent NLTK releases load for word_tokenize; 'punkt'
# is kept so that older releases keep working.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the spaCy English model, installing it first when it is missing
try:
    nlp = spacy.load("en_core_web_sm")
    print("Module 'en_core_web_sm' is loaded.")
except OSError:
    print("Module 'en_core_web_sm' is not installed. Installing now...")
    # Run the download with the kernel's own interpreter (sys.executable); a
    # bare "python" on PATH may belong to a different environment.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=False,
    )
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # Show the restart hint only when the reload really failed
        print("If the module still couldn't be loaded, but the installation was succesful, try to restart the kernel.")
        raise
    print("Module 'en_core_web_sm' is loaded.")

## Task 1: Sentiment analysis using polarity

In [None]:
import pandas as pd

# Read the tab-separated review file into a dataframe
df_t1 = pd.read_csv('amazon_alexa.tsv', sep='\t')

# Copy the review text into a new 'reviews' column as strings, leaving
# 'verified_reviews' untouched.
# NOTE(review): astype(str) presumably turns missing reviews into the literal
# text 'nan' - confirm whether the file contains empty reviews.
df_t1['reviews'] = df_t1['verified_reviews'].astype(str)

# Preview the first rows
df_t1.head()

In [None]:
import contractions

# Pass a single review text through the contractions library
def remove_contractions(text):
    """Return the result of ``contractions.fix`` for *text*."""
    expanded = contractions.fix(text)
    return expanded

# Apply the contraction handling to every review, overwriting 'reviews'
df_t1['reviews'] = df_t1['reviews'].apply(remove_contractions)

df_t1

In [None]:
import re

# Delete everything that is not a letter A-Z/a-z, a digit 0-9 or whitespace
def remove_special_characters(text):
    """Return *text* with all other characters removed (not replaced by spaces)."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]+')
    return disallowed.sub('', text)

# Remove special characters from every review, overwriting the 'reviews' column
df_t1['reviews'] = df_t1['reviews'].apply(remove_special_characters)

# Bare expression at the end of the cell so the notebook renders the frame
df_t1

In [None]:
from nltk.tokenize import word_tokenize

# Split a text into tokens with NLTK's word_tokenize
def tokenize(text):
    """Return the token list that ``word_tokenize`` produces for *text*."""
    tokens = word_tokenize(text)
    return tokens

# Store the token list of every review in a new 'reviews_tokenized' column
df_t1['reviews_tokenized'] = df_t1['reviews'].apply(tokenize)

df_t1

In [None]:
# Lower-case every token of a token list
def lowercase(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# Lower-case the tokens of every review, overwriting 'reviews_tokenized'
df_t1['reviews_tokenized'] = df_t1['reviews_tokenized'].apply(lowercase)

# Bare expression at the end of the cell so the notebook renders the frame
df_t1

In [None]:
from nltk.corpus import stopwords

# English stopword list from NLTK, held as a set for membership tests
stop_words = set(stopwords.words('english'))

# Drop stopwords from a token list
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Filter the stopwords out of every review's token list
df_t1['reviews_tokenized'] = df_t1['reviews_tokenized'].apply(remove_stopwords)

df_t1

In [None]:
from nltk.stem import WordNetLemmatizer

# One WordNet lemmatizer instance shared by all calls
lemmatizer = WordNetLemmatizer()

# Lemmatize a token list
def lemmatize(tokens):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the tokens of every review, overwriting 'reviews_tokenized'
df_t1['reviews_tokenized'] = df_t1['reviews_tokenized'].apply(lemmatize)

df_t1

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Rebuild one space-separated string per review from its token list
df_t1['reviews_cleaned'] = df_t1['reviews_tokenized'].apply(lambda token_list: ' '.join(token_list))

# Count how often each vocabulary word occurs in each review
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_t1['reviews_cleaned'])

# Summing over the rows gives one total per vocabulary word
sum_words = X.sum(axis=0)

# Pair every word with its total and order the pairs by total, largest first
words_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 20 most frequent words and split them into labels and bar heights
common_words = words_freq[:20]
words, counts = zip(*common_words)

# Bar chart of the most frequent words
plt.figure(figsize=(10, 5))
plt.bar(words, counts)
plt.title('20 Most Common Words in Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
from textblob import TextBlob

# Classify a text as Positive / Neutral / Negative from its TextBlob polarity
def get_polarity(text):
    """Return the sentiment label for *text*.

    The score is read from ``TextBlob(str(text)).sentiment.polarity``.
    Scores above 0.1 are "Positive", scores of -0.1 or below are
    "Negative", and every score in between is "Neutral".
    """
    textblob = TextBlob(str(text))
    pol = textblob.sentiment.polarity # type: ignore
    # Every score must map to a label: leaving gaps (weak scores between
    # -0.1 and 0.1, or a score of exactly -1) would return None and drop
    # those reviews from the label counts.
    if pol > 0.1:
        return "Positive"
    if pol <= -0.1:
        return "Negative"
    return "Neutral"

# Label every cleaned review
df_t1['polarity'] = df_t1['reviews_cleaned'].apply(get_polarity) # type: ignore

In [None]:
# Count the reviews per sentiment label
polarity_counts = df_t1['polarity'].value_counts()
print(polarity_counts)

# Choose each slice colour from its label. value_counts() orders the labels
# by frequency, so a fixed colour list would follow the counts rather than
# the labels.
label_colors = {"Positive": "green", "Neutral": "yellow", "Negative": "red"}
polarity_counts.plot(
    kind='pie',
    autopct='%1.0f%%',
    colors=[label_colors.get(label, "grey") for label in polarity_counts.index],
)

### Notes

* The best results were achieved when stopwords were removed before lemmatizing the reviews. When lemmatizing first, many junk words such as "wa" ended up in the top list because the lemmatizer converts "was" to "wa".

* Removing contractions was also needed; otherwise words like "isn't" were split into "is" and "n't", and "n't", which is not in the stopword list, was never removed.

## Task 2: Text classification using Random Forest

In [None]:
import pandas as pd

# Read the tab-separated movie review file into a dataframe
df_t2 = pd.read_csv('moviereviews.tsv', sep='\t')

# Print the number of null values in each column
print(df_t2.isnull().sum())

# Drop every row that contains a NaN value, modifying df_t2 in place
df_t2.dropna(inplace=True)

In [None]:
import contractions

# Pass a single review text through the contractions library
def remove_contractions(text):
    """Return the result of ``contractions.fix`` for *text*."""
    expanded = contractions.fix(text)
    return expanded

# Apply the contraction handling to every review, overwriting 'review'
df_t2['review'] = df_t2['review'].apply(remove_contractions)

df_t2

In [None]:
import re

# Delete everything that is not a letter A-Z/a-z, a digit 0-9 or whitespace
def remove_special_characters(text):
    """Return *text* with all other characters removed (not replaced by spaces)."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]+')
    return disallowed.sub('', text)

# Remove special characters from every review, overwriting the 'review' column
df_t2['review'] = df_t2['review'].apply(remove_special_characters)

# Bare expression at the end of the cell so the notebook renders the frame
df_t2

In [None]:
from nltk.tokenize import word_tokenize

# Split a text into tokens with NLTK's word_tokenize
def tokenize(text):
    """Return the token list that ``word_tokenize`` produces for *text*."""
    tokens = word_tokenize(text)
    return tokens

# Store the token list of every review in a new 'review_tokenized' column
df_t2['review_tokenized'] = df_t2['review'].apply(tokenize)

df_t2

In [None]:
# Lower-case every token of a token list
def lowercase(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

# Lower-case the tokens of every review, overwriting 'review_tokenized'
df_t2['review_tokenized'] = df_t2['review_tokenized'].apply(lowercase)

# Bare expression at the end of the cell so the notebook renders the frame
df_t2

In [None]:
from nltk.corpus import stopwords

# English stopword list from NLTK, held as a set for membership tests
stop_words = set(stopwords.words('english'))

# Drop stopwords from a token list
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Filter the stopwords out of every review's token list
df_t2['review_tokenized'] = df_t2['review_tokenized'].apply(remove_stopwords)

df_t2.head()

In [None]:
from nltk.stem import WordNetLemmatizer

# One WordNet lemmatizer instance shared by all calls
lemmatizer = WordNetLemmatizer()

# Lemmatize a token list
def lemmatize(tokens):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the tokens of every review, overwriting 'review_tokenized'
df_t2['review_tokenized'] = df_t2['review_tokenized'].apply(lemmatize)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Rebuild one space-separated string per review from its token list
df_t2['review_cleaned'] = df_t2['review_tokenized'].apply(lambda token_list: ' '.join(token_list))

# Count how often each vocabulary word occurs in each review
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_t2['review_cleaned'])

# Summing over the rows gives one total per vocabulary word
sum_words = X.sum(axis=0)

# Pair every word with its total and order the pairs by total, largest first
words_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 20 most frequent words and split them into labels and bar heights
common_words = words_freq[:20]
words, counts = zip(*common_words)

# Bar chart of the most frequent words
plt.figure(figsize=(10, 5))
plt.bar(words, counts)
plt.title('20 Most Common Words in Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing set (70 : 30).
# A fixed random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_t2['review_cleaned'],
    df_t2['label'],
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features followed by a random forest classifier.
# random_state is fixed so that refitting on the same training data
# reproduces the same forest and the same predictions.
text_clf_forest = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

# Fit the model
text_clf_forest.fit(X_train, y_train)

# Predict the test data
y_pred = text_clf_forest.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Score the predictions against the true test labels and print the result
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: {:.3f}'.format(accuracy))