In [76]:
import os
from functools import lru_cache
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Resolve the project root as the parent of the kernel's working directory.
# A notebook kernel has no __file__ variable; the earlier
# os.path.abspath("__file__") passed a quoted literal and therefore resolved
# against the working directory as well. os.getcwd() says so explicitly and
# yields the same BASE_DIR.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "datasets")
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

# Load the train and test CSVs from DATA_DIR.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [77]:
# Preview the first rows of the freshly loaded training data.
train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [78]:
# Normalise the free-text complaint column: trim surrounding whitespace, then
# turn empty strings and placeholder tokens into NaN so they are counted (and
# can later be dropped) as missing values.
# `np` comes from the notebook's import cell; without that import this cell
# raises NameError on a fresh kernel.
train_df['crimeaditionalinfo'] = (
    train_df['crimeaditionalinfo']
    .str.strip()
    .replace(['', 'NA', 'null', '?', '-'], np.nan)
)


In [79]:
# Per-column count of null entries in the training frame, printed under a heading.
print("\nMissing Values in Each Column:", train_df.isna().sum(), sep="\n")


Missing Values in Each Column:
category                 0
sub_category          6591
crimeaditionalinfo    1223
dtype: int64


In [80]:
# Summary statistics (count / unique / top / freq) before rows with missing values are dropped.
train_df.describe()

Unnamed: 0,category,sub_category,crimeaditionalinfo
count,93686,87095,92463
unique,15,35,83908
top,Online Financial Fraud,UPI Related Frauds,Respected Sir\r\n\r\nA very serious matter I w...
freq,57434,26856,2342


In [81]:
# Discard every row that has a missing value in any column, then repeat the
# per-column null count to confirm nothing is left.
train_df = train_df.dropna(axis=0, how="any")
print("\nMissing Values in Each Column:", train_df.isna().sum(), sep="\n")


Missing Values in Each Column:
category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64


In [83]:
# Summary statistics again, now that rows with missing values have been removed.
train_df.describe()

Unnamed: 0,category,sub_category,crimeaditionalinfo
count,85924,85924,85924
unique,11,35,80111
top,Online Financial Fraud,UPI Related Frauds,Financial Fraud
freq,56718,26479,513


In [84]:
from sklearn.preprocessing import LabelEncoder

In [85]:
# Integer-encode two text columns with scikit-learn's LabelEncoder.
# NOTE: despite the "user"/"product" wording in the variable and column names,
# the values being encoded are the complaint category and the complaint text.
# The names are left as they are because later cells refer to them.
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()
for _encoded_col, _source_col, _encoder in (
    ('encoded_user_id', 'category', user_encoder),
    ('encoded_product_id', 'crimeaditionalinfo', product_encoder),
):
    train_df[_encoded_col] = _encoder.fit_transform(train_df[_source_col])

In [86]:
# Preview the frame with the two integer-encoded columns appended.
train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo,encoded_user_id,encoded_product_id
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,8,28416
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,6,58919
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,7,21029
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,8,38720
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,6,33665


In [87]:
# Flag repeated complaint texts. With keep='first' the first occurrence of a
# text is marked False and every later repeat of it is marked True.
duplicates_info_train = train_df['crimeaditionalinfo'].duplicated(keep='first')
# Summing the boolean mask gives the number of repeats.
total_duplicates_info_train = duplicates_info_train.sum()

print(f"Total number of duplicate entries in train 'crimeaditionalinfo': {total_duplicates_info_train}")


Total number of duplicate entries in train 'crimeaditionalinfo': 5813


In [88]:
# Reverse lookups from each integer code back to the text it was derived from:
# user_id_map covers the category codes, product_id_map the complaint-text codes.
user_id_map = {
    code: label
    for code, label in zip(train_df['encoded_user_id'], train_df['category'])
}
product_id_map = {
    code: text
    for code, text in zip(train_df['encoded_product_id'], train_df['crimeaditionalinfo'])
}

In [89]:
# Drop rows with any missing value, then keep only the first occurrence of
# each complaint text.
train_df = train_df.dropna()
train_df = train_df.drop_duplicates(subset=['crimeaditionalinfo'])

# Snapshot of the de-duplicated frame under the name `df`.
# pd.DataFrame(train_df) does not copy the underlying data by default, so an
# explicit copy is taken to keep later changes to train_df from reaching df.
df = train_df.copy()

In [90]:
# Removal of stopwords and lemmatization

In [92]:
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# NLTK resources needed below: the English POS-tagger model and the WordNet corpus.
for _resource in ('averaged_perceptron_tagger_eng', 'wordnet'):
    nltk.download(_resource)

# Shared lemmatizer instance used by clean_and_lemmatize.
lemmatizer = WordNetLemmatizer()

@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``pos_tag``; the first letter of the
    resulting tag selects adjective (J), noun (N), verb (V) or adverb (R), and
    any other tag falls back to noun.

    Results are memoised per distinct word. The lookup depends only on the
    word itself, and the same words recur across many complaints, so caching
    avoids calling the tagger again for every repeated token.
    """
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def clean_and_lemmatize(text):
    """Lower-case ``text``, blank out non-word characters, and lemmatize each token.

    The non-word regex class leaves letters, digits and underscores in place,
    so only punctuation and other symbols are replaced by spaces. Each
    remaining whitespace-separated token is lemmatized with the POS returned
    by ``get_wordnet_pos`` and the results are joined with single spaces.
    """
    lowered = text.lower()
    spaced = re.sub(r'\W', ' ', lowered)
    lemmas = []
    for token in spaced.split():
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

# Build the cleaned_text column by passing every complaint through clean_and_lemmatize.
train_df['cleaned_text'] = train_df['crimeaditionalinfo'].map(clean_and_lemmatize)



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\pavin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\pavin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
# Preview the lemmatized text next to the original complaint.
train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo,encoded_user_id,encoded_product_id,cleaned_text
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,8,28416,i have continue receive random call and abusiv...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,6,58919,the above fraudster be continuously message me...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,7,21029,he be act like a police and demand for money b...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,8,38720,in apna job i have apply for job interview for...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,6,33665,i receive a call from lady state that she will...


In [94]:
import nltk
from nltk.corpus import stopwords

# Make sure the stopword corpus is available locally.
nltk.download('stopwords')

# English stopwords held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    Tokens are compared exactly as written (no case folding) and the
    surviving tokens are re-joined with single spaces.
    """
    kept = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept)

# Strip stopwords from the lemmatized text in place, then preview the result.
train_df['cleaned_text'] = train_df['cleaned_text'].map(remove_stopwords)
train_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category,sub_category,crimeaditionalinfo,encoded_user_id,encoded_product_id,cleaned_text
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,8,28416,continue receive random call abusive message w...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,6,58919,fraudster continuously message ask pay money s...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,7,21029,act like police demand money add section text ...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,8,38720,apna job apply job interview telecalling resou...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,6,33665,receive call lady state send new phone vivo re...


In [95]:
# Re-count repeated complaint texts after the earlier de-duplication.
# keep='first' marks only the second and later occurrences of a text as True.
duplicates_info_train = train_df['crimeaditionalinfo'].duplicated(keep='first')
# The sum of the boolean mask is the number of remaining repeats.
total_duplicates_info_train = duplicates_info_train.sum()

print(f"Total number of duplicate entries in train 'crimeaditionalinfo': {total_duplicates_info_train}")


Total number of duplicate entries in train 'crimeaditionalinfo': 0


In [96]:
# Keep the first row for each distinct complaint text.
train_df = train_df.drop_duplicates(subset='crimeaditionalinfo', keep='first')



In [97]:
##TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with the vocabulary capped at 1000 terms
# (tune max_features to the dataset size).
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
tfidf_matrix = tfidf.fit_transform(train_df['cleaned_text'])

# Dense copy with one column per term, for inspection only.
# NOTE(review): tfidf_df gets a fresh 0..n-1 index, while train_df keeps its
# original row labels after the earlier drops - align them before joining.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)


In [98]:
## tokenization and stemming

In [100]:
#!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# NLTK resources needed for this step: the stopword corpus and the punkt_tab
# tokenizer data.
import nltk
for _resource in ('stopwords', 'punkt_tab'):
    nltk.download(_resource)


# English stopword set and the Porter stemmer used by preprocess_text.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()

def preprocess_text(text):
    """Tokenize ``text``, drop stopwords, and stem the remaining tokens.

    Tokens come from ``word_tokenize``; any token present in ``stop_words`` is
    skipped and every other token is passed through ``stemmer.stem``. The
    stems are returned joined by single spaces.
    """
    stems = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Stem the stopword-free text into a new `cleaned` column.
train_df['cleaned'] = train_df['cleaned_text'].apply(preprocess_text)

# Persist the processed frame in DATA_DIR (row index not written).
train_df.to_csv(os.path.join(DATA_DIR, "preprocessed_data1.csv"), index=False)

# Display the processed data. This must be the cell's final expression:
# a bare head() followed by another statement is evaluated and then discarded,
# so nothing was shown while it sat above the to_csv call.
train_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pavin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from py2neo import Graph, Node, Relationship
from neo4j import GraphDatabase

# Connect to the database.
# Connection settings come from the environment so that no credentials are
# stored in the notebook: NEO4J_URI and NEO4J_USER fall back to the local
# defaults, and the password is read from NEO4J_PASSWORD or, if that is unset
# or empty, prompted for interactively.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
auth = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: "),
)
driver = GraphDatabase.driver(uri, auth=auth)

def create_knowledge_graph(tx, category, crimeaditionalinfo):
    """Merge one category/complaint pair into the graph.

    Runs a single Cypher statement on ``tx`` that MERGEs a ``Category`` node
    keyed by ``name``, a ``Complaint`` node keyed by ``info``, and a
    ``HAS_COMPLAINT`` relationship from the category to the complaint.

    Args:
        tx: object exposing ``run(query, **parameters)``.
        category: value bound to the ``$category`` query parameter.
        crimeaditionalinfo: value bound to the ``$crimeaditionalinfo`` parameter.
    """
    cypher = """
    MERGE (c:Category {name: $category})
    MERGE (comp:Complaint {info: $crimeaditionalinfo})
    MERGE (c)-[:HAS_COMPLAINT]->(comp)
    """
    params = {"category": category, "crimeaditionalinfo": crimeaditionalinfo}
    tx.run(cypher, **params)
# Keep only rows that have both fields the graph needs.
df = df.dropna(subset=['category', 'crimeaditionalinfo'])

# Limit the export to the first 20,000 rows.
limited_df = df[:20000]

# Write each complaint with its own execute_write call. Iterating the two
# columns directly avoids building a Series per row as iterrows() does.
# The try/finally closes the driver even when a write fails part-way through.
try:
    with driver.session() as session:
        for complaint_category, complaint_text in zip(
            limited_df['category'], limited_df['crimeaditionalinfo']
        ):
            session.execute_write(create_knowledge_graph, complaint_category, complaint_text)
finally:
    driver.close()
