In [2]:
# import libraries
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

nltk.download('all')
nltk.download('stopwords')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/maryellenschuster/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/maryellenschuster/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/maryellenschuster/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_

True

In [3]:
#read in data 
barbie_data = pd.read_csv("barbie_Cleaned.csv")
barbie_data = barbie_data[barbie_data['rating'] != '1']
barbie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 533 entries, 0 to 795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    533 non-null    object
 1   rating  533 non-null    object
dtypes: object(2)
memory usage: 12.5+ KB


In [4]:
#clean text
#separate conjoined words using wordninja 

import wordninja
barbie_data['text'] = barbie_data['text'].apply(lambda x: ' '.join(wordninja.split(x)))
barbie_data

Unnamed: 0,text,rating
0,Beautiful film but so preachy Love of Legacy 2...,6
1,Clever and fun beginning turns into another le...,6
2,It was depressing cox a nees a 24 July 2023 Wa...,8
3,A Technicolor Dream heather hil gers 24 July 2...,9
4,reasons FOR seeing it and 1 reason AGAINST im ...,7
...,...,...
791,Waste of times an as hak ib 23 July 2023 If ur...,2
792,Barbie not good thong man 22 July 2023 The onl...,2
793,Feminism but the rest iy as emin kos e 23 July...,5
794,Great sets great execution but horrible storyl...,3


In [5]:
#create date column
barbie_data = pd.DataFrame(barbie_data)

# Extract date using regular expression
date_pattern = r'(\d{1,2})\s(July)\s(\d{4})'
barbie_data['full_date'] = barbie_data['text'].str.extract(date_pattern).apply(lambda x: ' '.join(x), axis=1)

# Display DataFrame with new column
print(barbie_data['full_date'])

0      21 July 2023
1      21 July 2023
2      24 July 2023
3      24 July 2023
4      22 July 2023
           ...     
791    23 July 2023
792    22 July 2023
793    23 July 2023
794    28 July 2023
795    27 July 2023
Name: full_date, Length: 533, dtype: object


In [6]:
#remove non-english words

with open('words.txt', 'r') as file:
    english_words = set(word.strip().lower() for word in file)

# Define a function to remove non-English words
def remove_non_english_words(text):
    words = text.split()
    valid_words = [word for word in words if word.lower() in english_words]
    return ' '.join(valid_words)

# Apply the function to your DataFrame
barbie_data['text'] = barbie_data['text'].apply(remove_non_english_words)

barbie_data

Unnamed: 0,text,rating,full_date
0,Beautiful film but so preachy Love of Legacy J...,6,21 July 2023
1,Clever and fun beginning turns into another le...,6,21 July 2023
2,It was depressing cox a a July Warning Spoiler...,8,24 July 2023
3,A Technicolor Dream heather gers July Wow this...,9,24 July 2023
4,reasons FOR seeing it and reason AGAINST im se...,7,22 July 2023
...,...,...,...
791,Waste of times an as hak ib July If ur over th...,2,23 July 2023
792,Barbie not good thong man July The only reason...,2,22 July 2023
793,Feminism but the rest iy as kos e July I'm NOT...,5,23 July 2023
794,Great sets great execution but horrible storyl...,3,28 July 2023


In [7]:
#remove numbers
import re
barbie_data['text'] = barbie_data['text'].apply(lambda x: re.sub(r'\d+', '', x) if isinstance(x, str) else x)

In [8]:
#remove phrases
phrases_to_remove = ['found', 'helpful', 'review', 'Sign','vote', 'Permalink', 'Warning', 'Spoilers', 'July']
barbie_data['text'] = barbie_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in phrases_to_remove]) if isinstance(x, str) else x)


In [9]:
#remove punctuation 
import string
barbie_data['text'] = barbie_data['text'].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)) if isinstance(x, str) else x)

In [10]:
#remove letters that aren't in words
def remove_non_word_letters(text):
    words = wordninja.split(text)
    valid_words = [word for word in words if word.isalpha()]
    return ' '.join(valid_words)

# Apply the function to the 'text' column
barbie_data['text'] = barbie_data['text'].apply(remove_non_word_letters)
barbie_data

Unnamed: 0,text,rating,full_date
0,Beautiful film but so preachy Love of Legacy M...,6,21 July 2023
1,Clever and fun beginning turns into another le...,6,21 July 2023
2,It was depressing cox a a out of this Was this...,8,24 July 2023
3,A Technicolor Dream heather gers Wow this movi...,9,24 July 2023
4,reasons FOR seeing it and reason AGAINST im se...,7,22 July 2023
...,...,...,...
791,Waste of times an as hak ib If ur over this mo...,2,23 July 2023
792,Barbie not good thong man The only reason this...,2,22 July 2023
793,Feminism but the rest iy as kos e Im NOT a Bar...,5,23 July 2023
794,Great sets great execution but horrible storyl...,3,28 July 2023


In [11]:
#convert rating to numeric
barbie_data['rating'] = pd.to_numeric(barbie_data['rating'], errors = 'coerce')

#drop Nas
barbie_data = barbie_data.dropna(subset = ['rating'])

In [12]:
#convert barbie_data to csv
barbie_data.to_csv('barbie_data.csv', index=False)