In [4]:
import pandas as pd
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import re

#Downloading stopwords for data cleaning
#nltk.download('stopwords')

class data_cleaning:
    def __init__(self, file_path):
        #Initialize with the path to the CSV file
        self.file_path = file_path
        self.df_data = None
        self.stop_word = set(stopwords.words('english'))
        
    def load_data(self):
        #Loading data from CSV file
        self.df_data = pd.read_csv(self.file_path, encoding='latin-1', 
                                   names=['target', 'id', 'date', 'flag', 'user', 'text'])
        print("Data loaded successfully.")
        
    def clean_text(self, text):
        #Clean the text by removing URLs, mentions, special characters, and stopwords
        text = re.sub(r'http\S+', '', text)  #Removes URLs
        text = re.sub(r'@\w+', '', text)     #Removes mentions
        text = re.sub(r'[^A-Za-z\s]', '', text)  #Removes special characters
        text = text.lower()  #Converts to lowercase
        text = ' '.join(word for word in text.split() if word not in self.stop_word)  #Removes stopwords
        return text
    
    def clean_data(self):
        #Applying text cleaning to the dataset
        if self.df_data is not None:
            self.df_data['cleaned_text'] = self.df_data['text'].apply(self.clean_text)
            print("Text data cleaned.")
        else:
            print("Data not loaded yet. Please load the data first.")
    
    def replace_target_values(self):
        #Replacing target values: 4 -> 1 for positive sentiment
        if self.df_data is not None:
            self.df_data['target'] = self.df_data['target'].replace(4, 1)
            print("Target values replaced: 4 -> 1 for positive sentiment.")
        else:
            print("Data not loaded yet. Please load the data first.")
    
    def save_cleaned_data(self, output_file):
        #Saving the cleaned data to a new CSV file
        if self.df_data is not None:
            self.df_data.to_csv(output_file, index=False)
            print(f"Cleaned data saved to {output_file}.")
        else:
            print("No data to save. Please clean the data first.")
    
    def sentiment_distribution(self):
        #Checking sentiment distribution
        if self.df_data is not None:
            print(f"Total Tweets: {len(self.df_data)}")
            print(self.df_data['target'].value_counts())
        else:
            print("Data not loaded yet. Please load the data first.")

#Main execution block
if __name__ == "__main__":
    file_path = 'C:\\Users\\sneh_\\Downloads\\archive (8)\\training.1600000.processed.noemoticon.csv'
    output_file = 'D:/Big Data Analytics/Term-2/BDM 1034 - Application Design for Big Data 01/Project_sentiment/training_data/cleaned_data.csv'
    
    #Calling data_cleaning class
    data = data_cleaning(file_path)
    
    #Loading the dataset
    data.load_data()
    
    #Cleaning the text data
    data.clean_data()
    
    #Replacing target values (4 -> 1)
    data.replace_target_values()
    
    #Checking sentiment dirtibution
    data.sentiment_distribution()
    
    #Saving the cleaned data to a new file
    data.save_cleaned_data(output_file)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\sneh_/nltk_data'
    - 'C:\\Users\\sneh_\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'C:\\Users\\sneh_\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'C:\\Users\\sneh_\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\sneh_\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [5]:
# Analyzing distribution of date column (assuming it’s in column 'C')
df['datetime'] = pd.to_datetime(df[df.columns[2]], errors='coerce')
df['date'] = df['datetime'].dt.date
df['hour'] = df['datetime'].dt.hour

# Plotting number of posts per date
plt.figure(figsize=(10, 5))
df['date'].value_counts().sort_index().plot(kind='line')
plt.title('Number of Posts per Date')
plt.xlabel('Date')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()



NameError: name 'df' is not defined

In [None]:
# Analyzing categorical column (Column D)
plt.figure(figsize=(8, 4))
sns.countplot(x=df[df.columns[3]])
plt.title('Distribution of Query Types')
plt.xticks(rotation=45)
plt.show()




In [None]:
# Analyze Text Length Distribution
df['text_length'] = df[df.columns[5]].astype(str).apply(len)
plt.figure(figsize=(8, 4))
sns.histplot(df['text_length'], bins=30, kde=True)
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Analyzing most active users (Column E)
top_users = df[df.columns[4]].value_counts().head(10)
plt.figure(figsize=(10, 5))
sns.barplot(x=top_users.index, y=top_users.values)
plt.title('Top 10 Most Active Users')
plt.xticks(rotation=45)
plt.ylabel('Number of Posts')
plt.show()




In [None]:
# WordCloud for Text Data (Column F)
text_data = ' '.join(df[df.columns[5]].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('WordCloud of Text Content')
plt.show()