In [15]:
import numpy as numpy
import pandas as pd
import chardet

#### Detecting the Encoding of the Dataset

In [18]:
def detect_encoding(file_path):
    """
    Guess the character encoding of a file with chardet.

    The file is opened in binary mode and its complete contents are handed
    to ``chardet.detect``; the ``'encoding'`` entry of the result is returned.

    Args:
        file_path (str): Location of the file to inspect.

    Returns:
        str: The value chardet reports under the ``'encoding'`` key.
    """
    with open(file_path, mode='rb') as handle:
        # Run detection on the raw bytes of the whole file.
        detection = chardet.detect(handle.read())
    return detection['encoding']

# Example usage
# Relative path to the raw training CSV; later cells reuse `file_path` to load it.
file_path = 'Datasets/train.csv'  # Replace with the actual file path
# Detect the file's text encoding before loading it with pandas.
encoding = detect_encoding(file_path)
print(f"Detected encoding: {encoding}")

Detected encoding: Windows-1252


#### Loading Dataset

In [39]:
# Show floating-point values with two decimal places in rendered output.
pd.options.display.precision = 2

In [40]:
# Windows-1252 is the encoding chardet reported for this file.
df = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding="Windows-1252",
)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,653000.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2380000.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1250000.0,26


#### Dropping Unwanted Columns

In [41]:
# Inspect the available column names before deciding which ones to drop.
df.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [42]:
# Columns this notebook treats as unwanted.
columns_to_drop = ['textID', 'selected_text', 'Land Area (Km²)', 'Age of User']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError after the first run.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²)
0,"I`d have responded, if I were going",neutral,morning,Afghanistan,38928346,60
1,Sooo SAD I will miss you here in San Diego!!!,negative,noon,Albania,2877797,105
2,my boss is bullying me...,negative,night,Algeria,43851044,18
3,what interview! leave me alone,negative,morning,Andorra,77265,164
4,"Sons of ****, why couldn`t they put them on t...",negative,noon,Angola,32866272,26


#### Creating Word Count and Character Count Columns

In [43]:
# For Word Count
def create_word_count(text):
    """
    Counts the whitespace-separated words in a piece of text.

    Args:
        text: Value to measure. Non-string values are converted with str();
            missing values (None, NaN, pd.NA) are treated as empty text.

    Returns:
        int: Number of words; 0 for empty or missing text.
    """
    # Without this guard str(NaN) becomes the literal "nan" and would be
    # counted as one word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(str(text).split())

# For Char Count
def create_char_count(text):
    """
    Counts the characters (including whitespace) in a piece of text.

    Args:
        text: Value to measure. Non-string values are converted with str();
            missing values (None, NaN, pd.NA) are treated as empty text.

    Returns:
        int: Number of characters; 0 for empty or missing text.
    """
    # Without this guard str(NaN) becomes the literal "nan" and would be
    # counted as three characters.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(str(text))

# Derive both length features from the same source column.
tweet_text = df['text']
df['word_count'] = tweet_text.apply(create_word_count)
df['char_count'] = tweet_text.apply(create_char_count)
df.head()

Unnamed: 0,text,sentiment,Time of Tweet,Country,Population -2020,Density (P/Km²),word_count,char_count
0,"I`d have responded, if I were going",neutral,morning,Afghanistan,38928346,60,7,36
1,Sooo SAD I will miss you here in San Diego!!!,negative,noon,Albania,2877797,105,10,46
2,my boss is bullying me...,negative,night,Algeria,43851044,18,5,25
3,what interview! leave me alone,negative,morning,Andorra,77265,164,5,31
4,"Sons of ****, why couldn`t they put them on t...",negative,noon,Angola,32866272,26,14,75


In [45]:
# Persist the processed frame. index=True (the pandas default) also writes the
# row index as the first, unnamed column of the CSV.
df.to_csv('Datasets/processed_train.csv', index=True)