In [73]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [75]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mvish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [77]:
# Load the dataset
file_path = r"C:\Users\mvish\Desktop\clubs\data.csv"  # Replace with the actual file path
df = pd.read_csv(file_path)


In [79]:

# Split the reviews into individual reviews
df['REVIEWS'] = df['REVIEWS'].apply(lambda x: x.split(','))
df = df.explode('REVIEWS')


In [83]:

# Clean the text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text


df['Cleaned_Reviews'] = df['REVIEWS'].apply(clean_text)


In [17]:
!pip install stopwords

Collecting stopwords
  Downloading stopwords-1.0.1-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading stopwords-1.0.1-py2.py3-none-any.whl (37 kB)
Installing collected packages: stopwords
Successfully installed stopwords-1.0.1


In [93]:
from sklearn.utils import resample
# Upsample minority classes
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)
# Combine and shuffle
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [91]:
# Check the distribution of the categories
category_counts = df['Key_Theme'].value_counts()

# Define the majority and minority classes
majority_class = category_counts.idxmax()
minority_classes = category_counts[category_counts < category_counts.max()].index.tolist()

# Split the dataframe into majority and minority classes
df_majority = df[df['Key_Theme'] == majority_class]

# Combine all minority classes into one dataframe
df_minority = df[df['Key_Theme'].isin(minority_classes)]

In [33]:


# Define a simple function to categorize reviews into key themes/topics
def categorize_review(review):
    if any(keyword in review for keyword in ['engage', 'activity', 'event']):
        return 'Engagement'
    elif any(keyword in review for keyword in ['learn', 'workshop', 'education']):
        return 'Learning Opportunities'
    elif any(keyword in review for keyword in ['support', 'helpful', 'community']):
        return 'Support'
    elif any(keyword in review for keyword in ['hands-on', 'practical', 'experience']):
        return 'Practical Application'
    elif any(keyword in review for keyword in ['diverse', 'variety', 'topics']):
        return 'Diversity of Initiatives'
    else:
        return 'Other'


In [99]:

# Apply the categorization to the dataset
df['Key_Theme'] = df['Cleaned_Reviews'].apply(categorize_review)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['Cleaned_Reviews'])

# Prepare the labels (Key Themes/Topics)
y = df['Key_Theme']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# To see how the model categorizes new reviews
new_reviews = ["Great workshop for learning new skills", "Needs more engaging activities", "Supportive community for learners"]
new_reviews_cleaned = [clean_text(review) for review in new_reviews]
new_reviews_tfidf = vectorizer.transform(new_reviews_cleaned)
predictions = model.predict(new_reviews_tfidf)

for review, theme in zip(new_reviews, predictions):
    print(f"Review: {review} \nPredicted Theme: {theme}\n")

                          precision    recall  f1-score   support

Diversity of Initiatives       1.00      1.00      1.00         3
              Engagement       0.00      0.00      0.00         2
  Learning Opportunities       0.00      0.00      0.00         3
                   Other       0.76      1.00      0.86        16
                 Support       1.00      1.00      1.00         4

                accuracy                           0.82        28
               macro avg       0.55      0.60      0.57        28
            weighted avg       0.69      0.82      0.74        28

Review: Great workshop for learning new skills 
Predicted Theme: Other

Review: Needs more engaging activities 
Predicted Theme: Other

Review: Supportive community for learners 
Predicted Theme: Support



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
print(df['Key_Theme'].value_counts())

Key_Theme
Other                       86
Support                     18
Diversity of Initiatives    14
Learning Opportunities      13
Engagement                   4
Practical Application        2
Name: count, dtype: int64


In [95]:
from sklearn.utils import resample
# Upsample minority classes
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)
# Combine and shuffle
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [55]:
!pip install scikit-learn



In [97]:
 from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
 model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)