### Import the Data into MongoDB (Bronze Layer)

In [15]:
import pandas as pd
from pymongo import MongoClient

# Load the raw CSV export; this DataFrame is the Bronze-layer input.
data = pd.read_csv("Brazilian_Portuguese_Sentiment.csv")

# Connect to the local MongoDB instance and select the target collection.
client = MongoClient('mongodb://localhost:27017/')
db = client["bigdata_project"]
collection = db["Brazilian_Portuguese_Sentiment"]

# Convert the DataFrame into one dict per row, the shape insert_many expects.
data_dict = data.to_dict("records")

# Make the load idempotent: without this, every re-run of the cell appends
# another full copy of the CSV to the collection. The collection is treated
# as a full snapshot of the file, so it is emptied before being reloaded.
collection.delete_many({})
if data_dict:  # insert_many rejects an empty document list
    collection.insert_many(data_dict)

# Check number of rows and columns
print(f"Rows: {data.shape[0]}, Columns: {data.shape[1]}")

Rows: 100679, Columns: 8


### Data Cleaning (Silver Layer)

In [16]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100679 entries, 0 to 100678
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   original_index         100679 non-null  int64  
 1   review_text            100679 non-null  object 
 2   review_text_processed  100679 non-null  object 
 3   review_text_tokenized  100679 non-null  object 
 4   polarity               88238 non-null   float64
 5   rating                 100679 non-null  int64  
 6   kfold_polarity         100679 non-null  int64  
 7   kfold_rating           100679 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 6.1+ MB


In [17]:
# Before data cleaning: number of missing values in each column
data.isna().sum()

original_index               0
review_text                  0
review_text_processed        0
review_text_tokenized        0
polarity                 12441
rating                       0
kfold_polarity               0
kfold_rating                 0
dtype: int64

In [32]:
# Suppress library warnings for the rest of the session (kept from the
# original cell so later cells behave the same).
import warnings
warnings.filterwarnings('ignore')

# Remove exact duplicate rows before imputing.
data = data.drop_duplicates()

# Handle missing values (if any): forward-fill, i.e. each missing entry takes
# the last valid value above it in row order. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill'), and reassigning instead of inplace=True
# keeps the cell free of in-place mutation.
# NOTE(review): a leading missing value has nothing above it and would stay
# null; forward-filling also copies a neighbouring row's value, which may not
# be a meaningful imputation for `polarity` -- confirm this is intended.
data = data.ffill()

In [19]:
# After data cleaning: number of missing values remaining in each column
data.isna().sum()

original_index           0
review_text              0
review_text_processed    0
review_text_tokenized    0
polarity                 0
rating                   0
kfold_polarity           0
kfold_rating             0
dtype: int64

### Aggregation (Gold Layer)

In [28]:
# Distribution of ratings: fraction of all reviews that received each rating.
# (Previously this section duplicated the raw counts printed below.)
rating_distribution = data['rating'].value_counts(normalize=True).reset_index()
rating_distribution.columns = ['Rating', 'Review_Share']
print("Distribution of Ratings:")
print(rating_distribution.head())

# Count of reviews by rating (absolute number of reviews per rating value)
rating_counts = data['rating'].value_counts().reset_index()
rating_counts.columns = ['Rating', 'Review_Count']
print("\nCount of Reviews by Rating:")
print(rating_counts.head())

# Average polarity by rating
# NOTE(review): if missing polarity values were imputed upstream, these means
# include the imputed values -- confirm that is acceptable.
avg_polarity_rating = data.groupby('rating')['polarity'].mean().reset_index()
avg_polarity_rating.columns = ['Rating', 'Avg_Polarity']
print("\nAverage Polarity by Rating:")
print(avg_polarity_rating.head())