### Notebook for Data preprocessing tasks

##### Importing Required dependencies

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer




In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sudip.pokhrel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Importing the dataset

In [14]:
# importing data into pandas df for preprocessing
twitter_df = pd.read_csv('../dataset/training.1600000.processed.noemoticon.csv', encoding='iSO-8859-1')

In [15]:
# Checking the Shape
twitter_df.shape

(1599999, 6)

In [16]:
# first 5 rows of the dataframe
twitter_df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [21]:
twitter_df['0'].unique()

array([0, 4])

##### Data is missing Header so we Import again with headers

In [23]:
column_names = ['target', 'id','date','flag','user','text']
tw_df = pd.read_csv('../dataset/training.1600000.processed.noemoticon.csv',names= column_names, encoding='iSO-8859-1')

In [24]:
tw_df.shape

(1600000, 6)

In [27]:
# printning first 4 rows of the dataframe
tw_df.head(4)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire


In [28]:
# counting missing values in dataset
tw_df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [29]:
# checking the distribution of target column
tw_df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

It Shows that there are half positive(target value 4) and half negative sentiment(target value 0) in our dataset




> **Now, We convert 
 the target from "4" to "1"**



>  **0 --> Negative Tweet**


>  **1 --> Positive Tweet**


In [36]:
tw_df.replace({'target':{4:1}}, inplace =True)

In [None]:
# Veryfying the Unique words in the Target Column
tw_df['target'].unique()

array([0, 1])

##### Now, We drop COlumns that will not have impact on result

In [50]:
# Specify the columns to drop
columns_to_drop = ["id", "date", "flag", "user"]  # These column will have no impact on sentiment

# Drop the columns
df = tw_df.drop(columns=columns_to_drop)

# Display the first few rows of the updated DataFrame
print(df.head())


   target                                               text
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       0  is upset that he can't update his Facebook by ...
2       0  @Kenichan I dived many times for the ball. Man...
3       0    my whole body feels itchy and like its on fire 
4       0  @nationwideclass no, it's not behaving at all....


In [53]:
df['text'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

##### Remove all special and numeric character from data

In [None]:

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r"@\w+", '', text) # Remove mentions (@username)
    text = re.sub(r"#", '', text)  # Remove hashtags (only # symbol, keep the text)
    text = re.sub(r"[^\w\s']", '', text) # Remove punctuation
    text = re.sub(r"\d+", '', text) # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters (e.g., emojis)
    text = re.sub(r"\s+", ' ', text).strip() # Remove extra whitespace
    
    return text


In [None]:
# Apply cleaning to your dataset
df['text'] = df['text'].apply(clean_text)
# Convert the 'Text' column to lowercase
df['text'] = df['text'].str.lower()


In [63]:
df['text'][0]

"a that's a bummer you shoulda got david carr of third day to do it d"

#### Removing the Stopwords

In [66]:
# Load the stopwords list
stop_words = set(stopwords.words('english'))

# Exclude critical stopwords (negation words and others important for sentiment)
important_words = {
    "against", "although", "because", "but", "can't", "couldn't", "despite", "doesn't", "don't", 
    "enough", "however", "isn't", "just", "never", "no", "nor", "not", "only", "quite", 
    "shouldn't", "so", "though", "too", "very", "wasn't", "won't", "wouldn't", "yet","isn't", "aren't", "wasn't", "weren't", "haven't", "hasn't", "hadn't",
    "don't", "doesn't", "didn't", "won't", "wouldn't", "shan't", "shouldn't",
    "can't", "couldn't", "mustn't", "mightn't", "needn't"
}

# Subtract important words from stopwords
custom_stop_words = stop_words - important_words

In [68]:
# Function to remove stopwords
def remove_stopwords(text):
    # Tokenize the text and filter out stopwords
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in custom_stop_words]
    return " ".join(filtered_words)

In [77]:
from joblib import Parallel, delayed

df['text'] = Parallel(n_jobs=-1)(delayed(remove_stopwords)(text) for text in df['text'])


In [78]:
df.head()

Unnamed: 0,target,text
0,0,that's bummer shoulda got david carr third day
1,0,upset can't update facebook texting might cry ...
2,0,dived many times ball managed save rest go bounds
3,0,whole body feels itchy like fire
4,0,no not behaving i'm mad because can't see


#### Stemming

In [None]:
# Initialize the SnowballStemmer for English
stemmer = SnowballStemmer('english')

Number of CPU cores available: 8
