In [1]:
import pandas as pd
import re
from datetime import datetime
import csv

In [2]:
#loading the data, putting column labels, dsiplays data
file_path = 'twitter.csv'
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv(file_path, names=column_names)
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [18]:
#takes a string text as input and performs several cleaning operations on it
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) #remove URLs
    text = re.sub(r'\@\w+', '', text) #remove mentions

    pattern = re.compile(r'(\#)(\w+)')
    match = pattern.search(text)
    if match:
        replace = match.group(2)
    else:
        replace = ''

    text = re.sub(r'\#\w+', replace, text) #remove tags, keep word
    text = re.sub(r'\n', ' ', text) #replace new lines with spaces
    text = re.sub(r'[^\w\s]', '', text) #remove punctuations
    text = text.split()
    return text

In [19]:
#apply cleaning functions
df['text'] = df['text'].apply(clean_text)
df['date'] = pd.to_datetime(df['date'], errors='coerce') #convert date column to datetime, invalid parsing will be set as NaT

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   target  1600000 non-null  int64         
 1   ids     1600000 non-null  int64         
 2   date    1600000 non-null  datetime64[ns]
 3   flag    1600000 non-null  object        
 4   user    1600000 non-null  object        
 5   text    1600000 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 85.4+ MB


In [21]:
df = df[df['target'].isin([0, 2, 4])] #ensure target column has only valid values (0, 2, 4)

In [22]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"[A, thats, a, bummer, You, shoulda, got, David..."
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,"[is, upset, that, he, cant, update, his, Faceb..."
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,"[I, dived, many, times, for, the, ball, Manage..."
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"[no, its, not, behaving, at, all, im, mad, why..."


In [23]:
df.isnull().sum() #check for sum of null values in each column

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   target  1600000 non-null  int64         
 1   ids     1600000 non-null  int64         
 2   date    1600000 non-null  datetime64[ns]
 3   flag    1600000 non-null  object        
 4   user    1600000 non-null  object        
 5   text    1600000 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 85.4+ MB


In [26]:
df.to_csv('twitter_cleaned.csv', index=False) # save the cleansed data