In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import nltk

In [None]:
# Fetch the NLTK corpora used later: the stop-word list and WordNet (for the lemmatizer)
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)


In [None]:


# Read the review CSV, discard the two unused columns, and rename the
# remaining ones to the names the rest of the notebook expects.
# ("Unnamed: 0" is presumably an index column saved with the CSV - TODO confirm.)
df = (
    pd.read_csv("EcoPreprocessed.csv")
    .drop(columns=["Unnamed: 0", "polarity"])
    .rename(columns={'division': 'label', 'review': 'sentence'})
)


In [None]:
# Convert labels to numeric values.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on already-converted labels is a no-op instead of
# turning every label into NaN (which a plain dict-based .map() would do).
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
numeric_labels = df['label'].map(lambda value: label_mapping.get(value, value))

# Fail here, with the offending values, rather than later inside model.fit().
unknown_labels = set(numeric_labels.unique()) - set(label_mapping.values())
if unknown_labels:
    raise ValueError(
        "Unexpected values in 'label' (not covered by label_mapping): "
        f"{sorted(unknown_labels, key=str)}"
    )
df['label'] = numeric_labels

# Shared resources used by preprocess_text: the English stop-word set and
# a WordNet lemmatizer, built once rather than per call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: replace every character that is not an ASCII letter or whitespace
    with a space, lower-case, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatise the remaining tokens with
    the module-level ``lemmatizer``.

    Args:
        text: The raw review. Non-string values (e.g. the float NaN that
            pandas uses for a missing cell) are treated as an empty review.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when the input
        is not a string or nothing survives cleaning.
    """
    # A non-string cell carries no letters to keep (digits would be stripped
    # anyway), and passing it to re.sub would raise TypeError.
    if not isinstance(text, str):
        return ''

    # Substitute a space rather than the empty string so that words joined
    # only by punctuation ("good,bad", "well-made") remain separate tokens.
    # Contractions split into fragments ("don't" -> "don", "t") which can
    # then be matched by the stop-word filter - assuming the stop-word list
    # contains those fragments; verify against the NLTK list in use.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()

    tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [None]:
# Apply preprocessing to the 'sentence' column
df['sentence'] = df['sentence'].apply(preprocess_text)

# Create a TF-IDF vectorizer limited to 1000 features
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed further down, so its vocabulary and IDF weights
# are computed with the test rows included. Consider splitting first and
# fitting on the training sentences only.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(df['sentence'])

# Convert the TF-IDF features to a DataFrame that carries df's own index.
# pd.concat(axis=1) aligns rows by index label, not by position; without
# index=df.index the features would get a fresh 0..n-1 index and would be
# paired with the wrong (or NaN) labels whenever df's index is not the
# default one (e.g. after rows have been filtered out).
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    index=df.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Combine the TF-IDF features DataFrame with the 'label' column (label last)
final_df = pd.concat([tfidf_df, df['label']], axis=1)


In [None]:
# Split the data into training and testing sets.
# final_df is laid out as [TF-IDF feature columns..., label], so features and
# target are taken by position. Selecting by the name 'label' breaks when the
# TF-IDF vocabulary itself contains the token "label": the frame then has two
# columns with that name, drop('label') would discard a feature and
# final_df['label'] would return a two-column frame as the target.
x = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [None]:
# Random Forest model: 100 trees, gini split criterion, fixed seed
forest_settings = {'n_estimators': 100, 'criterion': 'gini', 'random_state': 0}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train, Y_train)

In [None]:
# Evaluate the model: accuracy on the rows it was fitted on, then on the held-out rows
train_accuracy = model.score(X_train, Y_train)
print('Random Forest Training Accuracy:', train_accuracy)

y_pred = model.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred)
print('Accuracy on Test Set:', test_accuracy)
