In [7]:
import pandas as pd
import re
import emoji

# Function to clean the text data in the dataset
def clean_text(text):
    """Normalise a raw tweet into lowercase text made of letters and whitespace.

    Steps, in order: emojis are converted to their textual names with
    ``emoji.demojize``; URLs starting with ``http``, ``@`` handles and ``#``
    hashtags are removed together with the token attached to them; every
    remaining character that is not an ASCII letter or whitespace is dropped;
    the result is lower-cased.

    Args:
        text: The raw text. ``None`` and float NaN (the value pandas uses for
            empty CSV cells) are treated as missing and yield an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed tokens
        can leave consecutive spaces behind.
    """
    # Missing values would otherwise crash the string/regex calls below.
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert emojis to text; the name delimiters and underscores are
    # stripped later by the letters-only filter.
    text = emoji.demojize(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)     # Remove Twitter handles
    text = re.sub(r'#\S+', '', text)     # Remove hashtags (symbol and tag word)
    # Keep only ASCII letters and whitespace, then lower-case.
    text = re.sub(r'[^A-Za-z\s]', '', text).lower()
    return text

# Read the contest CSV and normalise its 'text' column in one chained step
file_path = 'sentiment_analysis_contest_test_file.csv'
df = pd.read_csv(file_path, engine='python').assign(
    text=lambda frame: frame['text'].apply(clean_text)
)

# Display the cleaned dataframe as the cell output
df


Unnamed: 0,index,text
0,8000,and praised and they thought they did a goo...
1,8001,more from and his insight with some unknow...
2,8002,rt has any candidate received a word from god...
3,8003,rt trump has cam hands
4,8004,watched the from last night and damn trump we...
...,...,...
2724,10724,i want to rt all of tweets there is nothing ...
2725,10725,rt this is real life these people are running...
2726,10726,rt john podhoretz on says emerges as the pa...
2727,10727,it wasnt a debateit was an effort to bring d...


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import numpy as np

# Simulating labels (replace this with actual labels in your dataset).
# NOTE: the labels are drawn at random, independently of the text, so the
# classification report below can only show chance-level performance.
np.random.seed(42)  # For reproducibility
n_rows = len(df)  # Derive the size from the data instead of hard-coding it
sampled_df = df.sample(n=n_rows)  # Shuffles every row (this is not a subsample)
simulated_labels = np.random.choice(['positive', 'negative', 'neutral'], size=n_rows)  # Randomly assigning sentiments

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sampled_df['text'], simulated_labels, test_size=0.25, random_state=42)

# Training a Naive Bayes classifier on token counts
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

# Evaluating the model on the held-out 25%
predicted = model.predict(X_test)
report = classification_report(y_test, predicted)

# output report
print(report)

              precision    recall  f1-score   support

    negative       0.32      0.27      0.29       235
     neutral       0.32      0.36      0.34       216
    positive       0.31      0.32      0.31       232

    accuracy                           0.31       683
   macro avg       0.31      0.31      0.31       683
weighted avg       0.31      0.31      0.31       683



In [13]:
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Function to clean the text data
def clean_text(text):
    """Reduce a raw tweet to lowercase ASCII letters and whitespace.

    Emojis are first converted to their textual names with
    ``emoji.demojize``. URLs starting with ``http`` and ``@`` handles are
    then removed. Finally every character that is not an ASCII letter or
    whitespace is dropped and the text is lower-cased. Hashtag words are
    kept; only the ``#`` symbol disappears in the final filter.

    Args:
        text: The raw text. ``None`` and float NaN (the value pandas uses for
            empty CSV cells) are treated as missing and yield an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string, with the original whitespace preserved.
    """
    # Guard against missing values, which the string/regex calls cannot
    # handle. NaN is the only float that is not equal to itself.
    is_nan = isinstance(text, float) and text != text
    if text is None or is_nan:
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = emoji.demojize(text)  # Convert emojis to text
    # Drop URLs first, then Twitter handles.
    for pattern in (r'http\S+', r'@\S+'):
        text = re.sub(pattern, '', text)
    # Remove special characters and convert to lowercase
    return re.sub(r'[^A-Za-z\s]', '', text).lower()

# Function to load and preprocess the training data
def load_and_preprocess_training_data(file_path):
    """Load a tab-separated training file into a cleaned DataFrame.

    The first line of the file is treated as a header and skipped. Every
    following line is stripped and split on tabs; only lines that yield
    exactly three fields (ID, sentiment marker, text) are kept, and all
    other lines are silently ignored.

    Args:
        file_path: Path to the UTF-8 encoded training file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID`` (as read, a string),
        ``Sentiment`` (``'positive'`` when the marker is ``'+'``, otherwise
        ``'negative'``) and ``Text`` (the text after ``clean_text``). An
        empty file produces an empty frame with these three columns.
    """
    columns = ['ID', 'Sentiment', 'Text']
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip the header line; the default keeps an empty file from
        # raising StopIteration.
        next(file, None)
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue  # Malformed row: not exactly ID / marker / text
            row_id, marker, raw_text = parts
            # Any marker other than '+' is labelled negative.
            sentiment = 'positive' if marker == '+' else 'negative'
            records.append((row_id, sentiment, clean_text(raw_text)))
    return pd.DataFrame(records, columns=columns)

# Build the labelled corpus from the tab-separated training file
training_file_path = 'Training.txt'  # Replace with the path to your training file
training_df = load_and_preprocess_training_data(training_file_path)

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    training_df['Text'],
    training_df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

# Token counts feeding a multinomial Naive Bayes classifier, fitted in one step
model = make_pipeline(CountVectorizer(), MultinomialNB()).fit(X_train, y_train)

# Score the held-out split and print the per-class metrics
predicted = model.predict(X_test)
report = classification_report(y_test, predicted)
print(report)

# Read the unlabelled CSV and clean its 'text' column with the same cleaner
test_file_path = 'training.csv'  # Replace with the path to your test file
test_df = pd.read_csv(test_file_path, engine='python')
test_df['text'] = test_df['text'].map(clean_text)

# Predict a sentiment label for every row of the CSV
test_predictions = model.predict(test_df['text'])

# Print the predicted labels (numpy abbreviates long arrays)
print(test_predictions)


              precision    recall  f1-score   support

    negative       0.77      0.80      0.78     99930
    positive       0.79      0.76      0.77    100046

    accuracy                           0.78    199976
   macro avg       0.78      0.78      0.78    199976
weighted avg       0.78      0.78      0.78    199976

['positive' 'positive' 'positive' ... 'positive' 'negative' 'negative']
