In [None]:
# import libraries
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Fetch only the NLTK resources this notebook relies on: the VADER lexicon
# (needed by SentimentIntensityAnalyzer) and the stopwords corpus (imported
# above). Downloading 'all' pulls every NLTK corpus and model, which is slow
# and unnecessary here.
nltk.download('vader_lexicon')
nltk.download('stopwords')

In [None]:
#read in data
barbie_data = pd.read_csv("barbie_Cleaned.csv")

# Keep every row whose rating is not the string '1'.
# NOTE(review): the comparison is against the *string* '1'; if pandas parses
# the column as numeric this filter removes nothing -- confirm the intent.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not target a slice of the frame read from disk.
barbie_data = barbie_data[barbie_data['rating'] != '1'].copy()
barbie_data.info()

In [None]:
#clean text
#separate conjoined words using wordninja 

import wordninja
def _split_conjoined(raw_text):
    """Split run-together words with wordninja and re-join the pieces with single spaces."""
    pieces = wordninja.split(raw_text)
    return ' '.join(pieces)


barbie_data['text'] = barbie_data['text'].apply(_split_conjoined)
barbie_data

In [None]:
#create date column
barbie_data = pd.DataFrame(barbie_data)

# Extract date using regular expression: "<day> July <year>" as three groups
date_pattern = r'(\d{1,2})\s(July)\s(\d{4})'
date_parts = barbie_data['text'].str.extract(date_pattern)

# Join the three captured groups with spaces. Rows whose text contains no
# matching date have NaN in every group; str.cat propagates the missing value
# (full_date becomes NaN) instead of raising TypeError the way ' '.join does
# when handed NaN.
barbie_data['full_date'] = date_parts[0].str.cat(
    [date_parts[1], date_parts[2]], sep=' '
)

# Display DataFrame with new column
print(barbie_data['full_date'])

In [None]:
#remove non-english words

# Build a lower-cased, whitespace-stripped vocabulary from words.txt (one entry per line)
with open('words.txt', 'r') as word_file:
    english_words = {entry.strip().lower() for entry in word_file}

# Define a function to remove non-English words
def remove_non_english_words(text, vocabulary=None):
    """Drop every whitespace-separated word whose lower-cased form is not in the vocabulary.

    Parameters
    ----------
    text : str or any
        Review text. Non-string values (e.g. NaN from a missing cell) are
        returned unchanged, matching the isinstance guards used by the other
        cleaning steps in this notebook.
    vocabulary : container of str, optional
        Lower-cased words to keep. Defaults to the module-level
        ``english_words`` set.

    Returns
    -------
    str or any
        The kept words, in their original casing and order, joined by single
        spaces; or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    if vocabulary is None:
        vocabulary = english_words
    return ' '.join(word for word in text.split() if word.lower() in vocabulary)

# Filter every review down to dictionary words and store the result back in 'text'
english_only_text = barbie_data['text'].apply(remove_non_english_words)
barbie_data['text'] = english_only_text


In [None]:
#remove numbers
import re
def _strip_digits(value):
    """Delete every run of digits from a string; return non-string values untouched."""
    if not isinstance(value, str):
        return value
    return re.sub(r'\d+', '', value)


barbie_data['text'] = barbie_data['text'].apply(_strip_digits)

In [None]:
#remove phrases
phrases_to_remove = ['found', 'helpful', 'review', 'Sign','vote', 'Permalink', 'Warning', 'Spoilers', 'July']


def _drop_listed_words(value):
    """Remove whitespace-separated tokens that exactly (case-sensitively) match an entry
    in phrases_to_remove; return non-string values untouched."""
    if not isinstance(value, str):
        return value
    kept_tokens = []
    for token in value.split():
        if token in phrases_to_remove:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


barbie_data['text'] = barbie_data['text'].apply(_drop_listed_words)

In [None]:
#remove punctuation 
import string
# Translation table that deletes every character in string.punctuation
_punctuation_table = str.maketrans("", "", string.punctuation)


def _strip_punctuation(value):
    """Delete ASCII punctuation from a string; return non-string values untouched."""
    if not isinstance(value, str):
        return value
    return value.translate(_punctuation_table)


barbie_data['text'] = barbie_data['text'].apply(_strip_punctuation)

In [None]:
#remove letters that aren't in words
def remove_non_word_letters(text):
    """Re-split the text with wordninja and keep only the purely alphabetic tokens.

    Returns the surviving tokens joined by single spaces.
    """
    alphabetic_tokens = filter(str.isalpha, wordninja.split(text))
    return ' '.join(alphabetic_tokens)

# Run the alphabetic-token filter over every review and write it back to 'text'
alphabetic_text = barbie_data['text'].apply(remove_non_word_letters)
barbie_data['text'] = alphabetic_text
barbie_data

In [None]:
#convert rating to numeric (values that cannot be parsed become NaN)
numeric_rating = pd.to_numeric(barbie_data['rating'], errors='coerce')
barbie_data['rating'] = numeric_rating

#drop rows whose rating is missing after the conversion
barbie_data = barbie_data.dropna(subset=['rating'])

In [None]:
#write the cleaned frame to disk without the index column
cleaned_csv_path = 'barbie_data.csv'
barbie_data.to_csv(cleaned_csv_path, index=False)

In [None]:
##distribution of review lengths by ratings 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.DataFrame(barbie_data)

#compute length of each review (number of characters in the cleaned text)
df['review_length'] = df['text'].apply(len)

# establish color for each rating
colors = {1: 'darksalmon', 2: 'firebrick', 3: 'darkorange', 4: 'gold', 5: 'lemonchiffon', 6: 'darkseagreen', 7:'seagreen', 8: 'lightblue', 9: 'steelblue'}


plt.figure(figsize=(10, 6))
#create separate histogram for each rating subsetted by review length
for rating in sorted(df['rating'].unique()):
    subset = df[df['rating'] == rating]
    # colors.get returns None for a rating that has no palette entry (the
    # palette stops at 9); matplotlib then picks the next colour from its
    # default cycle instead of the loop dying with a KeyError.
    plt.hist(subset['review_length'], bins=20, alpha=0.5, label=f'Rating {rating}', color=colors.get(rating))

plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths by Ratings')
plt.legend()
plt.grid(True)
plt.xscale('log')
plt.show()

In [None]:
## Distribution of ratings
ratings_data = barbie_data['rating']

# Wrap the ratings in a one-column DataFrame for plotting
df = pd.DataFrame({'ratings': ratings_data})

# Histogram with a KDE overlay, drawn in Seaborn's whitegrid style
sns.set(style="whitegrid")
ratings_ax = sns.histplot(df['ratings'], kde=True, bins=10, color='skyblue')
ratings_ax.set_title('Distribution Plot of Ratings')
ratings_ax.set_xlabel('Ratings')
ratings_ax.set_ylabel('Frequency')
plt.show()

In [None]:
##Plot top 40 most used adjectives in reviews 

import spacy
from collections import Counter
import matplotlib.pyplot as plt

# Load the English language model
# (the "en_core_web_sm" spaCy pipeline; its part-of-speech tags are read below)
nlp = spacy.load("en_core_web_sm")

# Series of cleaned review strings, parsed one at a time further down
reviews = barbie_data['text']

#remove adjectives that aren't helpful for analysis
words_to_remove = ['other', 'sure', 'first', 'many', 'same', 'own', 'little', 'most', 'few', 'main', 'such']

# Function to extract adjectives from a spacy parsed document
def extract_adjectives(doc):
    """Return the lower-cased text of every adjective token in ``doc``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``pos_`` (part-of-speech tag) and ``text``.

    Returns
    -------
    list of str
        Lower-cased adjective strings, in document order, excluding any word
        listed in ``words_to_remove``.

    The text is lower-cased once and used both for the stop-list check and for
    the returned value, so case variants such as "Great" and "great" are
    counted as the same adjective downstream.
    """
    adjectives = []
    for token in doc:
        if token.pos_ != 'ADJ':
            continue
        lowered = token.text.lower()
        if lowered not in words_to_remove:
            adjectives.append(lowered)
    return adjectives

# Parse every review with spaCy and pool the adjectives found in each one
adjectives = [
    adjective
    for review_text in reviews
    for adjective in extract_adjectives(nlp(review_text))
]

# Count the frequency of each adjective
adjective_freq = Counter(adjectives)

top_40_adjectives = dict(adjective_freq.most_common(40))

# Plotting the frequency distribution for the top 40 adjectives
adjective_labels = list(top_40_adjectives.keys())
adjective_counts = list(top_40_adjectives.values())
plt.figure(figsize=(10, 6))
plt.bar(adjective_labels, adjective_counts, color='skyblue')
plt.xlabel('Adjectives')
plt.ylabel('Frequency')
plt.title('Top 40 Adjectives in Reviews')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
#get vader scores
import nltk
# NOTE(review): machine-specific absolute path; consider deriving it from the
# user's home directory or a config value so the notebook runs elsewhere.
nltk.data.path.append('/Users/maryellenschuster/nltk_data')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Build the analyzer once and reuse it for every review. The original created
# a fresh SentimentIntensityAnalyzer inside the lambda, i.e. one per row,
# repeating the analyzer's setup work for each review.
vader_analyzer = SentimentIntensityAnalyzer()

# Store VADER's 'compound' score for each review
barbie_data['vader_scores'] = barbie_data['text'].apply(
    lambda review_text: vader_analyzer.polarity_scores(review_text)['compound']
)
barbie_data

In [None]:
#make sure both the sentiment score and the rating columns are numeric
for numeric_column in ('vader_scores', 'rating'):
    barbie_data[numeric_column] = pd.to_numeric(barbie_data[numeric_column])

In [None]:
#Plot sentiment scores overtime

df = pd.DataFrame(barbie_data)
df['full_date'] = pd.to_datetime(df['full_date'])  # Convert date column to datetime format

# A line plot connects points in row order, so the rows must be in
# chronological order first; otherwise the line jumps back and forth in time.
# mergesort is stable, so reviews sharing a date keep their original order.
chronological = df.sort_values('full_date', kind='mergesort')

# Plot sentiment scores over time
plt.figure(figsize=(10, 6))
plt.plot(chronological['full_date'], chronological['vader_scores'], marker='o', color='b', linestyle='-')

plt.title('Sentiment Scores Over Time')
plt.xlabel('Date')
plt.ylabel('Sentiment Score')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#distribution of sentiment scores 
import matplotlib.pyplot as plt

sentiment_scores = barbie_data['vader_scores']

# Ten-bin histogram of the VADER scores (adjust the number of bins as needed)
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(sentiment_scores, bins=10, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.5)
plt.show()

In [None]:
# Drop, in place, every row that has a missing value in any column
barbie_data.dropna(inplace=True)

# Scatter plot of rating against VADER score
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(data=barbie_data, x='vader_scores', y='rating')
scatter_ax.set_title('Correlation between VADER Sentiment Scores and Ratings')
scatter_ax.set_xlabel('VADER Sentiment Score')
scatter_ax.set_ylabel('Rating')
scatter_ax.grid(True)

# Correlation between the two columns (Series.corr default method)
vader_series = barbie_data['vader_scores']
rating_series = barbie_data['rating']
correlation_coefficient = vader_series.corr(rating_series)
print(f"Correlation Coefficient: {correlation_coefficient}")

plt.show()

In [None]:

# Single feature (VADER score) and target (rating) for the regression
X, y = barbie_data['vader_scores'], barbie_data['rating']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np

# Turn the single feature into an (n_samples, 1) column, the 2-D layout the
# estimator is fitted on below. np.asarray accepts both a pandas Series and an
# ndarray, so re-running this cell is safe; the original `.values` call raised
# AttributeError on a second run because X_train was already an ndarray.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [None]:
#fit an ordinary least-squares regression of rating on the VADER score
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
#predict ratings for the held-out rows
predictions = model.predict(X=X_test)

In [None]:
#score the held-out predictions
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

for report_line in (f'Mean Squared Error: {mse:.2f}', f'R-squared: {r2:.2f}'):
    print(report_line)

barbie_data.head(20)

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Define a function to map sentiment scores to predicted ratings
def map_vader_to_rating(vader_score):
    """Bucket a VADER score into a predicted rating between 2 and 5.

    Bands (lower bound inclusive):
        score >= 0.5   -> 5  (high positive sentiment)
        score >= 0     -> 4  (low positive sentiment)
        score >= -0.5  -> 3  (low negative sentiment)
        anything else  -> 2  (high negative sentiment)
    """
    # Checked from the most positive band downwards; the first match wins.
    bands = ((0.5, 5), (0, 4), (-0.5, 3))
    for lower_bound, rating in bands:
        if vader_score >= lower_bound:
            return rating
    return 2

# Bucket every VADER score and store the result as a new column
predicted = barbie_data['vader_scores'].apply(map_vader_to_rating)
barbie_data['predicted_ratings'] = predicted

print(barbie_data)