Name: Shreeya Chitnis 
Library: NLTK
URL: https://github.com/shreeyachitnis/NLTK
Description: NLTK, short for Natural Language Toolkit, is a comprehensive Python library designed to facilitate the exploration, processing, and analysis of human language data.



In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Load the dataset of customer reviews
df = pd.read_csv('data/Womens Clothing E-Commerce Reviews.csv')

In [None]:
# Drop rows with missing values in the 'Review Text' column
df.dropna(subset=['Review Text'], inplace=True)

In [None]:
# Reduce the length of the dataset as it takes a lot of time to process 
df = df.iloc[:len(df)//2]

In [None]:
# Preprocess the text data
_ENGLISH_STOP_WORDS = None  # populated on the first call to preprocess_text


def preprocess_text(text):
    """Tokenize a review into lowercase, alphanumeric, non-stopword tokens.

    Args:
        text: Review text as a string.

    Returns:
        A list of the tokens produced by ``word_tokenize(text.lower())``
        that are alphanumeric and not in NLTK's English stopword list.
    """
    global _ENGLISH_STOP_WORDS
    # Build the stopword set once and reuse it: this function is applied to
    # every row, so rebuilding the set on each call is wasted work.
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())

    # str.isalnum() rejects any token containing a non-alphanumeric
    # character, which is what removes punctuation tokens here.
    return [
        token
        for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOP_WORDS
    ]

In [None]:
# Apply preprocessing to the 'review' column
df['tokens'] = df['Review Text'].apply(preprocess_text)

In [None]:
sid = SentimentIntensityAnalyzer()
# Perform sentiment analysis using NLTK's VADER
df['sentiment_score'] = df['tokens'].apply(lambda tokens: sid.polarity_scores(' '.join(tokens))['compound'])



In [None]:
# Histogram of the per-review sentiment scores
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['sentiment_score'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Extract key insights from the text data
def extract_insights(tokens):
    """Return the ten most frequent tokens together with their counts.

    Args:
        tokens: The tokens of one review.

    Returns:
        The result of ``FreqDist(tokens).most_common(10)``: at most ten
        ``(token, count)`` pairs, most frequent first.
    """
    return FreqDist(tokens).most_common(10)

In [None]:
# Apply insights extraction to the 'tokens' column
df['insights'] = df['tokens'].apply(extract_insights)

In [None]:
insights_list = []
# Iterate through the DataFrame and collect insights
for idx, row in df.iterrows():
    review_number = idx + 1
    insights = row['insights']
    insights_list.append({'Review': review_number, 'Insights': insights})

# Create a new DataFrame from the list
insights_df = pd.DataFrame(insights_list)

In [None]:
insights_df.head(10)

In [None]:
# Example 2: classifying movie reviews with TF-IDF features and Naive Bayes

In [None]:
# Load the dataset of movie reviews
df = pd.read_csv('data/movie.csv')

In [None]:
df.head(2)

In [None]:
# Reduce the length of the dataset as it takes a lot of time to process 
df = df.iloc[:len(df)//8]

In [None]:
# Tokenization, Lowercasing, Removing Stopwords and Punctuation
stop_words = set(stopwords.words('english'))


def _clean_review(text):
    """Join the lowercased alphanumeric, non-stopword tokens of *text* with spaces."""
    kept = []
    for word in word_tokenize(text):
        lowered = word.lower()
        # Keep a token only if it is alphanumeric and not an English stopword.
        if word.isalnum() and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)


df['preprocessed_text'] = df['text'].apply(_clean_review)



In [None]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['label'], test_size=0.2, random_state=42)

In [None]:
# Feature Extraction: TF-IDF Vectorization

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Train a classifier (e.g., Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

In [None]:
# Evaluate the classifier
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))