# Spark NLP Installation

In [None]:
# Install pinned versions of PySpark and Spark NLP (-q keeps pip output quiet).
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

# Install or upgrade the Spark NLP Display library.
# NOTE(review): unlike the packages above, this one is not version-pinned, so
# the installed version may differ between runs — consider pinning it.
! pip install --upgrade -q spark-nlp-display

In [None]:
import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType


import json
import pandas as pd
import numpy as np
import random

In [None]:
# Obtain the Spark session from sparknlp.start(); later cells use this
# `spark` handle to build DataFrames.
spark = sparknlp.start()

# Report the Spark NLP and Apache Spark versions in use for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Bare expression: the notebook displays the session object as the cell output.
spark

Spark NLP version 4.2.8
Apache Spark version: 3.3.0


In [None]:
# Reads the raw 'text' column and writes the 'document' annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Reads 'document' and writes 'sentence'.
sentence_detector = (
    SentenceDetector()
    .setInputCols('document')
    .setOutputCol('sentence')
)

# Reads 'sentence' and writes 'token'.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Pretrained "bengali_cc_300d" word embeddings (language code "bn");
# reads 'sentence' + 'token' and writes 'embeddings'.
embeddings = (
    WordEmbeddingsModel.pretrained("bengali_cc_300d", "bn")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Pretrained "bengaliner_cc_300d" NER model; writes the 'ner' column.
# NOTE(review): this stage reads "document" while the embeddings and the
# converter below read "sentence" — confirm the mix is intentional.
ner_model = (
    NerDLModel.pretrained("bengaliner_cc_300d", "bn")
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

# Reads 'sentence' + 'token' + 'ner' and writes the 'ner_chunk' column that
# downstream code inspects for entity results.
ner_converter = (
    NerConverter()
    .setInputCols(['sentence', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

# Stage order matters: each stage consumes columns produced by earlier ones.
pipeline_stages = [
    documentAssembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
]

nlp_pipeline = Pipeline(stages=pipeline_stages)



bengali_cc_300d download started this may take some time.
Approximate size to download 818.3 MB
[OK!]
bengaliner_cc_300d download started this may take some time.
Approximate size to download 14.2 MB
[OK!]


In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used in
# later cells live under this mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# text_list = ["""১৯৪৮ সালে তিশা নিশো মুন্সিগঞ্জ উচ্চ বিদ্যালয় থেকে মেট্রিক পাশ করেন""",
#              """ এবং ১৯৫০ সালে তিশা আপু মুন্সিগঞ্জ হরগঙ্গা কলেজ থেকে ইন্টারমেডিয়েট পাশ করেন."""]

In [None]:
# Location of the input CSV on the mounted Drive.
toxicity_csv_path = "/content/drive/MyDrive/Thesis/Gender_Converted_Dataset/Final_Toxicity_Augmented_dataset - Final_Toxicity_Augmented_dataset.csv"

# Load the CSV into a pandas DataFrame and preview the first rows.
df = pd.read_csv(toxicity_csv_path)
df.head()

Unnamed: 0,original sentence,converted sentence
0,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান ভাই,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান আফা
1,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান ভাই,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান বোন
2,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান ভাই,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান ভাবী
3,এসে দেখে মেয়ে হাসছে...,এসে দেখে ছেলে হাসছে...
4,"পরী তকমা লাগানো, হট সেক্স ভিডিও কবে আপলোড করেছ...","জ্বিন তকমা লাগানো, হট সেক্স ভিডিও কবে আপলোড কর..."


In [None]:
# Keep only the "converted sentence" column as a one-column DataFrame and
# preview it. (A later cell rebinds `text_list` to a string Series before
# it is passed to per_name_find.)
text_list = df["converted sentence"].to_frame()
text_list.head()

Unnamed: 0,converted sentence
0,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান আফা
1,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান বোন
2,অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান ভাবী
3,এসে দেখে ছেলে হাসছে...
4,"জ্বিন তকমা লাগানো, হট সেক্স ভিডিও কবে আপলোড কর..."


In [None]:
# df.info()

In [None]:
# new_df = new_df.dropna()

In [None]:
# gender_term= pd.read_csv('/content/drive/MyDrive/Thesis/male_female-2.csv')
# gender_term.head()

In [None]:
# practice_df = new_df.head(50)

In [None]:
# practice_df.head()

In [None]:
# male_gender_term_list = gender_term['Masculine'].tolist()
# male_gender_term_set= set(male_gender_term_list)
# female_gender_term_list = gender_term['Feminine'].tolist()
# female_gender_term_set = set(female_gender_term_list)

# print(male_gender_term_set)
# print(female_gender_term_set)

In [None]:
# from pyspark.sql import Row

# total_list = []

# def per_name_find(text_list):
#     # List to store data for each row
#     rows = []

#     # Process each text in the text_list
#     for sentence in text_list:
#         # Get NER prediction from model
#         df = spark.createDataFrame([sentence], StringType()).toDF("text")
#         result = nlp_pipeline.fit(df).transform(df)
#         answerlist = result.toPandas().loc[:, "ner_chunk"].tolist()

#         # List to store PERSON names in the current sentence
#         per_names = []

#         # Check if there are any NER predictions
#         if len(answerlist[0]) > 0:
#             for x in answerlist[0]:
#                 if x["metadata"]['entity'] == 'PER':
#                     per_names.append(x["result"])

#         # Append data for the current sentence to the list of rows
#         rows.append(Row(Sentence=sentence, Name=per_names))

#     # Create a DataFrame from the list of rows
#     df_result = spark.createDataFrame(rows)

#     # Print the DataFrame
#     df_result.show(truncate=False)

#     return df_result

# # Assuming male_name_change function is called with text_list as input
# per_name_find(text_list)


In [None]:
# from pyspark.sql import Row
# from pyspark.sql.types import StringType

# def per_name_find(text_list):
#     # List to store data for each row
#     rows = []

#     # Process each text in the text_list
#     for sentence in text_list:
#         # Get NER prediction from model
#         df = spark.createDataFrame([sentence], StringType()).toDF("text")
#         result = nlp_pipeline.fit(df).transform(df)
#         answerlist = result.toPandas().loc[:, "ner_chunk"].tolist()

#         # List to store PERSON names in the current sentence
#         per_names = []

#         # Check if there are any NER predictions
#         if len(answerlist[0]) > 0:
#             for x in answerlist[0]:
#                 if x["metadata"]['entity'] == 'PER':
#                     per_names.append(x["result"])

#         # Append data for the current sentence to the list of rows
#         rows.append(Row(Converted_sentence=sentence, Name=per_names))

#     # Create a DataFrame from the list of rows
#     df_result = spark.createDataFrame(rows)

#     # Print the DataFrame
#     df_result.show(truncate=False)

#     return df_result

# # Assuming text_list is a column from DataFrame df
# text_list = df["converted sentence"]

# # Call per_name_find function with text_list as input
# per_name_find(text_list)


In [None]:
from pyspark.sql import Row
from pyspark.sql.types import StringType

def per_name_find(text_list):
    """Extract PER entity chunks for every sentence in ``text_list``.

    All sentences are loaded into a single Spark DataFrame and sent through
    ``nlp_pipeline`` in one fit/transform pass, rather than creating a
    one-row DataFrame and re-fitting the pipeline for each sentence.

    Parameters
    ----------
    text_list : iterable
        Sentences to annotate (e.g. a pandas Series). Every item is converted
        with ``str()`` before processing.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per input sentence with two columns:
        ``Converted_sentence`` (string) and ``Name`` (array of strings with
        the text of each chunk whose entity label is ``'PER'``; an empty
        array when the sentence has none).

    Notes
    -----
    Uses the notebook-level ``spark`` session and ``nlp_pipeline``. The
    resulting DataFrame is also printed via ``show(truncate=False)``.
    """
    # Local import keeps this cell self-contained; only StringType is
    # imported at notebook level.
    from pyspark.sql.types import ArrayType, StringType, StructField, StructType

    # Explicit schema: schema inference cannot determine the element type of
    # `Name` when every list is empty, and cannot handle an empty row list.
    result_schema = StructType([
        StructField("Converted_sentence", StringType(), True),
        StructField("Name", ArrayType(StringType()), True),
    ])

    # Materialise the input once so any iterable is accepted.
    sentences = [str(sentence) for sentence in text_list]

    rows = []
    if sentences:
        # One DataFrame and one pipeline pass for the whole batch.
        input_df = spark.createDataFrame(sentences, StringType()).toDF("text")
        annotated = nlp_pipeline.fit(input_df).transform(input_df)

        # NOTE(review): rows are expected to come back in input order for
        # this pipeline — confirm if exact ordering matters downstream.
        for record in annotated.select("text", "ner_chunk").collect():
            # Keep only the chunks labelled as PER.
            per_names = [
                chunk["result"]
                for chunk in record["ner_chunk"]
                if chunk["metadata"]['entity'] == 'PER'
            ]
            rows.append((record["text"], per_names))

    # Create a DataFrame from the collected rows
    df_result = spark.createDataFrame(rows, schema=result_schema)

    # Print the DataFrame
    df_result.show(truncate=False)

    return df_result

# Take the "converted sentence" column of df and cast every value to str so
# that per_name_find only receives strings.
# NOTE(review): astype(str) presumably turns missing values into the literal
# text "nan" — confirm whether such rows should be dropped first.
text_list = df["converted sentence"].astype(str)  # Convert column to string

# Run PER-name extraction over every sentence; df_result is exported in the next cell.
df_result = per_name_find(text_list)

# # Call per_name_find function with text_list as input
# per_name_find(text_list)


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
|Converted_sentence                                                                                                                                                                         |Name                     |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
|অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান আফা                                                                                                                                                       |[আরিয়ান আফা]             |
|অসাধারণ ১টা নাটক😢ধন্যবাদ আরিয়ান বোন                                                                                                     

In [None]:
# Destination CSV on the mounted Drive.
output_path = "/content/drive/MyDrive/Thesis/NER_Columned_Dataset/Toxicity_Name_Extracted.csv"

# Bring the Spark result into pandas, then write it out without the index column.
df_result_pandas = df_result.toPandas()
df_result_pandas.to_csv(output_path, index=False)

print(f"File saved to {output_path}")

File saved to /content/drive/MyDrive/Thesis/NER_Columned_Dataset/Toxicity_Name_Extracted.csv
