Preprocessing the dataset

In [None]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt
import nltk
# NOTE(review): 'all' fetches every NLTK package. The cells visible in this notebook
# only use stopwords, word_tokenize and WordNetLemmatizer, so a narrower download may
# be enough -- confirm that no other cell needs additional resources before trimming.
nltk.download('all')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Read the raw train/test files, expose the free-text complaint column under the
# shorter name `text`, and drop every row that carries no complaint text.
# NOTE(review): only train.csv is read with an explicit ISO-8859-1 encoding; confirm
# that test.csv is meant to be read with the pandas default.
data = (
    pd.read_csv("train.csv", encoding="ISO-8859-1")
    .rename(columns={'crimeaditionalinfo': 'text'})
    .dropna(subset=['text'])
)
test = (
    pd.read_csv("test.csv")
    .rename(columns={'crimeaditionalinfo': 'text'})
    .dropna(subset=['text'])
)

In [None]:
# Number of training rows without a sub_category (measured before filling).
null_count = data['sub_category'].isnull().sum()

# A missing sub_category falls back to the row's own category.
# Each frame must be filled from ITS OWN `category` column: fillna with a Series
# aligns on the index, so filling `test` from `data['category']` would copy labels
# from unrelated training rows (and leave NaN wherever the index has no match).
# Plain assignment is used instead of `fillna(..., inplace=True)` on a column
# selection, which is chained assignment and is not guaranteed to update the frame.
data['sub_category'] = data['sub_category'].fillna(data['category'])
test['sub_category'] = test['sub_category'].fillna(test['category'])

print('null count: ', null_count)

# Number of entries per sub-category within each category (cell output).
data.groupby('category')['sub_category'].value_counts()

In [None]:
# Lookup table built from the training frame: sub_category -> category.
mapping = (
    data
    .set_index('sub_category')
    .loc[:, 'category']
    .to_dict()
)


def get_category(sub_category):
    """Return the category stored in `mapping` for `sub_category`, or None if absent."""
    if sub_category in mapping:
        return mapping[sub_category]
    return None


# Distinct sub-category labels present in the training frame, as a plain list.
distinct_subcategory_values = data['sub_category'].unique()
subcategories = distinct_subcategory_values.tolist()


In [None]:
# Sub-category labels seen in each frame.
unique_data_subcategories = set(data['sub_category'].unique())
unique_test_subcategories = set(test['sub_category'].unique())

# Labels that occur in one frame but not in the other.
exclusive_in_data_subcategories = unique_data_subcategories.difference(unique_test_subcategories)
exclusive_in_test_subcategories = unique_test_subcategories.difference(unique_data_subcategories)

# Each report is a header line followed by the label list on its own line.
print(
    "Subcategories exclusive to 'train' dataframe:",
    list(exclusive_in_data_subcategories),
    sep="\n",
)
print(
    "\nSubcategories exclusive to 'test' dataframe:",
    list(exclusive_in_test_subcategories),
    sep="\n",
)
exclusive_in_test_subcategories

In [None]:
# Category labels seen in each frame.
unique_data_categories = set(data['category'].unique())
unique_test_categories = set(test['category'].unique())

# Labels found only in the training frame / only in the test frame.
exclusive_in_data = unique_data_categories.difference(unique_test_categories)
exclusive_in_test = unique_test_categories.difference(unique_data_categories)

# Show both sets together as the cell output.
(exclusive_in_data, exclusive_in_test)


In [None]:
# Training complaints with fewer whitespace-separated words than this are discarded.
MIN_WORD_COUNT = 4

# No earlier cell creates `word_count`, so on a fresh kernel the filter below would
# raise KeyError. Derive it from the text when the column is not already present.
if 'word_count' not in data.columns:
    data['word_count'] = data['text'].str.split().str.len()

# .copy() makes the filtered frame independent, so assigning to data['text'] below
# is not a chained assignment on a slice of the unfiltered frame.
data = data[data['word_count'] >= MIN_WORD_COUNT].copy()

# "not" is kept on purpose: the later `Stopwords` definition in this notebook removes
# "not" from its stop-word set to preserve negation, which has no effect if the word
# has already been stripped here.
stop_words = set(stopwords.words('english')) - {"not"}


def _strip_stop_words(text):
    """Tokenize `text` and re-join the tokens that are not in `stop_words`."""
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)


# Lower-case first so the comparison against the lower-case stop-word list matches.
data['text'] = data['text'].str.lower().apply(_strip_stop_words)
test['text'] = test['text'].str.lower().apply(_strip_stop_words)

In [None]:
# Keep only the rows whose `text` value is a Python str; everything else is dropped.
train_text_is_str = data['text'].apply(lambda value: isinstance(value, str))
test_text_is_str = test['text'].apply(lambda value: isinstance(value, str))
data = data[train_text_is_str]
test = test[test_text_is_str]

# Total number of characters remaining in the training text (cell output).
total_char_count = data['text'].str.len().sum()
total_char_count

In [None]:
# Translation table mapping every ASCII punctuation character to a single space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def Text_Cleaning(Text):
    """Normalize one raw text string.

    Steps, in order: lower-case, remove URLs, replace punctuation with spaces,
    remove digit runs, replace newlines with spaces.

    Parameters
    ----------
    Text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are not collapsed.
    """
    Text = Text.lower()
    # URLs must be removed BEFORE punctuation is replaced: once ':', '/' and '.'
    # have become spaces the URL pattern can no longer match anything.
    Text = re.sub(r'https?://\S+|www\.\S+', '', Text)
    Text = Text.translate(_PUNCT_TO_SPACE)
    Text = re.sub(r'\d+', '', Text)
    # Newlines become a space (not an empty string) so the last word of one line
    # is not glued to the first word of the next.
    Text = re.sub(r'\n', ' ', Text)
    return Text
# English stop words with "not" taken out, so "not" is never filtered as a stop word.
Stopwords = set(nltk.corpus.stopwords.words("english")).difference(["not"])


def Text_Processing(Text):
    """Tokenize `Text`, drop tokens found in `Stopwords`, lemmatize the rest.

    Returns the surviving lemmatized tokens joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    kept_lemmas = [
        lemmatize(token)
        for token in nltk.word_tokenize(Text)
        if token not in Stopwords
    ]
    return " ".join(kept_lemmas)

In [None]:
# Run every complaint through Text_Cleaning and then Text_Processing, for both frames.
data["text"] = data["text"].apply(Text_Cleaning).apply(Text_Processing)
test["text"] = test["text"].apply(Text_Cleaning).apply(Text_Processing)

# Total number of characters in the training text after cleaning (cell output).
total_char_count = data['text'].str.len().sum()
total_char_count

In [None]:
# Porter-stem every token of every complaint and re-join the stems with single spaces.
stemmer = PorterStemmer()
data['text'] = data['text'].apply(
    lambda sentence: ' '.join(map(stemmer.stem, nltk.word_tokenize(sentence)))
)
test['text'] = test['text'].apply(
    lambda sentence: ' '.join(map(stemmer.stem, nltk.word_tokenize(sentence)))
)

In [None]:
# Write both processed frames to CSV, leaving out the row index.
data.to_csv(path_or_buf='cleaned_train.csv', index=False)
test.to_csv(path_or_buf='cleaned_test.csv', index=False)