<center>
    <font color = '#00FFFF'>
        <h1>Tweets Classification</h1>
    </font>
    <font color = '#FFA500'>
        <h3>Natural Language Processing</h3>
    </font>
</center>

# Loading Dependencies

In [39]:
#region Downloading Dataset

# Operating System.
from os import path, environ, listdir, remove

# JSON.
from json import load

# ZIP-File.
from zipfile import ZipFile

# Input-Output.
from io import BytesIO

# Requests.
from requests import get, Session

# Kaggle API.
from kaggle.api.kaggle_api_extended import KaggleApi

#endregion

#region Data Manipulation

# Pandas.
from pandas import read_csv, DataFrame, Series

# Numpy.
from numpy import NAN

# Randomization.
from random import random

# String.
from string import ascii_letters

#endregion

# Regular Expression.
from re import findall

#region Visualization

# Plotly-Express.
from plotly.express import scatter

# Seaborn.
from seaborn import heatmap

# Matplot-Library.
from matplotlib.pyplot import show

# Word-Cloud.
from wordcloud import WordCloud

#endregion

#region Natural Language Processing

#region Natural Language Toolkit

# Stop-Words.
from nltk.corpus import stopwords

# Tokenization.
from nltk.tokenize import RegexpTokenizer, TweetTokenizer

# Stemming and Lemmatization.
from nltk.stem import WordNetLemmatizer, PorterStemmer

#endregion

#endregion

#region Machine Learning Model

#region Torch

#endregion

#endregion

# Data Exploration

## Fetching Dataset

In [45]:
# Loading the dataset from the Kaggle platform using URL -> https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis/data.

def fetch_dataset() -> DataFrame | None:

    #region Configurations

    # File-Name for storing the credentials.
    api_file_path = 'Kaggle.JSON'
    
    # URL for dataset on Kaggle.
    dataset_url = 'jp797498e/twitter-entity-sentiment-analysis'
    
    # Dataset File Name.
    train_dataset_file_path = 'twitter_training.csv'
    
    #endregion

    # Proceed when the file exists.
    if path.exists(path = api_file_path):
        
        #region Loading the credentials from the file.
        
        # Loading the file for credentials.
        with open(file = api_file_path, mode = 'r') as json_file:

            # Store the file content in JSON format.
            kaggle_api_credentials = load(json_file)
        
        #endregion
        
        #region Storing Credentials in OS Environment
        
        # Storing User-Name.
        environ['KAGGLE_USERNAME'] = kaggle_api_credentials['username']
        
        # Storing API-Key.
        environ['KAGGLE_KEY'] = kaggle_api_credentials['key']
        
        #endregion
    
        #region Fetching Data from Kaggle.

        # Creating an instance for interacting with Kaggle.
        kaggle_api = KaggleApi()
        
        # Authenticate using credentials downloaded.
        kaggle_api.authenticate()
        
        # Downloading the Dataset Files.
        kaggle_api.dataset_download_files(dataset = dataset_url, unzip = True)
        
        #endregion

        # Reading the dataset once downloaded in the local system.
        dataset = read_csv(filepath_or_buffer = train_dataset_file_path)
        
        #region Data Cleaning
        
        #region Dropping Kaggle API Credentials from Operating Systen's Environment
        
        # Dropping the User-Name.
        del environ['KAGGLE_USERNAME']
        
        # Dropping the API-Key.
        del environ['KAGGLE_KEY']
        
        #endregion
        
        #region Removing All CSV Files

        # Iterating over each file in the current working directory.
        for file in listdir():

            # Proceed when found a CSV file.            
            if file.endswith('.csv'):

                # Removing CSV file found in CWD.
                remove(file)
        
        #endregion
        
        #endregion
        
        # Return the stored dataset after the file has been deleted.
        return dataset
    
    else:
        print('Failed to find the file for the API credentials.\nPlease validate the file path and OS restrictions.')
        return None

In [48]:
# Fetching the required dataset from Kaggle; the result is None when the credentials file is missing.
train_dataset = fetch_dataset()

In [49]:
# First look at the dataset (the rendering shows only the leading and trailing records).
train_dataset

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [50]:
# Understanding the datatype and the non-null count of each column.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [51]:
# Analyzing the statistics of the data; include = 'all' covers the non-numeric columns as well.
train_dataset.describe(include = 'all')

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
count,74681.0,74681,74681,73995.0
unique,,32,4,69490.0
top,,TomClancysRainbowSix,Negative,
freq,,2400,22542,172.0
mean,6432.640149,,,
std,3740.423819,,,
min,1.0,,,
25%,3195.0,,,
50%,6422.0,,,
75%,9601.0,,,


# Data Cleaning

## Identifying Missing Data

In [52]:
# Listing the number of missing values in each column of the dataset.
train_dataset.isna().sum()

2401                                                       0
Borderlands                                                0
Positive                                                   0
im getting on borderlands and i will murder you all ,    686
dtype: int64

In [58]:
# Identifying the percentage of missing values in each column (share of missing entries scaled by 100).
train_dataset.isna().mean() * 100

2401                                                     0.000000
Borderlands                                              0.000000
Positive                                                 0.000000
im getting on borderlands and i will murder you all ,    0.918574
dtype: float64

## Identifying Data Duplication

In [55]:
# Flagging the duplicate records once, so the count and the percentage are derived from the same mask.
duplicate_mask = train_dataset.duplicated()

# Number of duplicate records and their percentage of the dataset.
duplicate_mask.sum(), duplicate_mask.mean() * 100

(2700, 3.615377405230246)

## Handling Duplicated and Missing Data

In [59]:
# Removing the duplicate records first and then the records with missing values,
# which make up less than 1% of the dataset.
train_dataset = (
    train_dataset
    .drop_duplicates()
    .dropna()
)

In [63]:
# Validating the operation on duplicate data: no duplicate record should remain.
train_dataset.duplicated().sum()

0

In [65]:
# Validating the operation on missing data: every column should report zero missing values.
train_dataset.isna().sum()

2401                                                     0
Borderlands                                              0
Positive                                                 0
im getting on borderlands and i will murder you all ,    0
dtype: int64

## Fixing Feature Naming Conventions

In [71]:
# Descriptive names for the features, listed in the same order as the dataset's columns.
new_features_name = ['Index', 'Topic', 'Sentiments', 'Text']

# Pairing each current column label with its replacement by position and renaming in place,
# which prevents confusion in the later analysis and predictions.
train_dataset.rename(columns = dict(zip(train_dataset.columns, new_features_name)), inplace = True)

In [72]:
# Validating the renaming operation by listing the column labels.
train_dataset.columns

Index(['Index', 'Topic', 'Sentiments', 'Text'], dtype='object')