In [None]:
import numpy as np
import pandas as pd
import logging
from sklearn.preprocessing import MinMaxScaler

class DataUtils:
    """Load the Jigsaw, C3 and Ruddit CSV files and bring them to a common shape.

    After :meth:`prepare_data` the C3 and Ruddit frames contain exactly the
    columns ``text`` and ``y``; ``y`` is min-max scaled to ``[0, 1]`` in all
    three frames. The Jigsaw frame is assumed to already carry a ``y`` column
    (the code scales it without renaming anything) -- TODO confirm against the CSV.
    """

    def __init__(self) -> None:
        pass

    def load_different_data(self):
        """Read the three raw CSV files.

        Returns:
            tuple: ``(Jigsaw, c3_data, ruddit_data)`` as untouched dataframes.

        Raises:
            Exception: whatever ``pd.read_csv`` raised, after logging it.
        """
        logging.info("Loading Jigsaw, C3 and ruddit data")
        try:
            Jigsaw = pd.read_csv("../input/alldataraw/train_data.csv")
            c3_data = pd.read_csv("../input/alldataraw/C3_anonymized.csv")
            ruddit_data = pd.read_csv("../input/alldataraw/ruddit_with_text.csv")
            logging.info("Data loaded successfully")
            return Jigsaw, c3_data, ruddit_data
        except Exception as e:
            logging.error(f"Error loading data: {e}")
            # Bare ``raise`` keeps the original traceback intact.
            raise

    def prepare_data(self):
        """Load the datasets, keep the text/target columns and scale the targets.

        Returns:
            tuple: ``(Jigsaw, c3_data, ruddit_data)`` with ``y`` scaled to ``[0, 1]``.

        Raises:
            Exception: any loading or preparation error, after logging it.
        """
        logging.info("Preparing data for Further processing")
        try:
            Jigsaw, c3_data, ruddit_data = self.load_different_data()

            # The "Unnamed: *" columns are leftovers of saving a frame together
            # with its index; tolerate files in which one of them is absent.
            Jigsaw = Jigsaw.drop(
                columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore"
            )

            # ``.copy()`` detaches the two-column selections from the frames
            # they were sliced from, so the assignments below cannot trigger
            # SettingWithCopyWarning or write through to the raw data.
            c3_data = c3_data[["comment_text", "agree_toxicity_expt"]].copy()
            ruddit_data = ruddit_data[["txt", "offensiveness_score"]].copy()

            # Common schema: free text in "text", regression target in "y".
            c3_data.columns = ["text", "y"]
            ruddit_data.columns = ["text", "y"]

            # Each dataset is scaled independently with its own scaler; the
            # (n, 1) result is flattened to match the shape of a column.
            for frame in (ruddit_data, c3_data, Jigsaw):
                scaled = MinMaxScaler().fit_transform(frame[["y"]])
                frame["y"] = np.ravel(scaled)

            logging.info("Data prepared successfully")
            return Jigsaw, c3_data, ruddit_data
        except Exception as e:
            logging.error(f"Error preparing data: {e}")
            raise

    def concatenate_dfs(self, Jigsaw, c3_data, ruddit_data):
        """Stack the three frames vertically with a fresh 0..n-1 index.

        Args:
            Jigsaw, c3_data, ruddit_data: dataframes to concatenate, in that order.

        Returns:
            pd.DataFrame: the row-wise concatenation.

        Raises:
            Exception: any error raised by ``pd.concat``, after logging it.
        """
        logging.info("Concatenating dataframes")
        try:
            data = pd.concat([Jigsaw, c3_data, ruddit_data], ignore_index=True)
            logging.info("Data concatenated successfully")
            return data
        except Exception as e:
            logging.error(f"Error concatenating data: {e}")
            raise

In [None]:
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud
from wordcloud import WordCloud, STOPWORDS



class EDA:
    """Quick exploratory plots and summaries for the toxicity datasets."""

    def __init__(self, data, jigsaw, c3data, ruddit) -> None:
        """
        Input:
            data: dataframe ( concatenated with Jigsaw, C3 and ruddit data)
            jigsaw: dataframe (Jigsaw data)
            c3data: dataframe (C3 data)
            ruddit: dataframe (ruddit data)

        The plotting methods read the ``y`` and ``text`` columns of every frame.
        """
        self.data = data
        self.jigsaw = jigsaw
        self.c3data = c3data
        self.ruddit = ruddit

    def data_info(self):
        """Print shape, ``info()``, dtypes and head of the concatenated data.

        Raises:
            Exception: any error raised while inspecting the frame, after logging it.
        """
        logging.info("Basic Data Exploration")
        try:
            print("Shape of the data: ", self.data.shape)
            # DataFrame.info() prints its report itself and returns None, so it
            # must not be nested inside print() (that printed a stray "None").
            print("Info of the data: ")
            self.data.info()
            print("Data types: ", self.data.dtypes)
            print("Data head: ", self.data.head())
        except Exception as e:
            logging.error(f"Error in basic data exploration: {e}")
            raise

    def exploring_distributions(self):
        """Plot the distribution of the target ``y`` for each of the four frames.

        Every frame gets the same plot: a density-normalised histogram with a
        KDE overlay, the documented replacement for the deprecated
        ``sns.distplot``.

        Raises:
            Exception: any plotting error, after logging it.
        """
        logging.info("Exploring the distribution of target labels")
        frames = (
            ("concatenated", self.data),
            ("Jigsaw", self.jigsaw),
            ("C3", self.c3data),
            ("ruddit", self.ruddit),
        )
        try:
            for label, frame in frames:
                print(f"Showing Distribution of {label} data")
                sns.histplot(frame["y"], kde=True, stat="density")
                plt.show()
        except Exception as e:
            logging.error(f"Error in exploring distribution: {e}")
            raise

    def _show_word_cloud(self, frame):
        """Render one word cloud built from every row of ``frame["text"]``."""
        # str(series) would only yield the truncated repr (head/tail rows plus
        # index numbers and dtype footer); join the actual texts instead.
        corpus = " ".join(frame["text"].dropna().astype(str))
        cloud = WordCloud(
            background_color="white",
            stopwords=STOPWORDS,
            max_words=200,
            max_font_size=40,
            random_state=42,
        ).generate(corpus)

        plt.figure(figsize=(20, 20))
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

    def word_cloud(self):
        """Show a word cloud for the concatenated data and for each dataset.

        Raises:
            Exception: any error raised while building or drawing a cloud,
                after logging it.
        """
        logging.info("WordCloud of the data")
        frames = (
            ("WordCloud of the data", self.data),
            ("WordCloud of the Jigsaw data", self.jigsaw),
            ("WordCloud of the C3 data", self.c3data),
            ("WordCloud of the ruddit data", self.ruddit),
        )
        try:
            for heading, frame in frames:
                print(heading)
                self._show_word_cloud(frame)
        except Exception as e:
            logging.error(f"Error in word cloud: {e}")
            raise


# if __name__ == "__main__":
#     # =======================================================================
#     # Load data
#     data_utils = DataUtils()
#     jigsaw_data, c3_data, ruddit_data = data_utils.load_different_data()
#     Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
#     data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)
#     # =======================================================================

#     # =======================================================================
#     # Exploring data
#     eda = EDA(data, jigsaw_data, c3_data, ruddit_data)
#     eda.data_info() 
#     eda.exploring_distributions() 
#     eda.word_cloud() 
#     eda.skeweness_and_kurtosis() 
    # ========================================================================

In [None]:
import numpy as np
import pandas as pd
import logging
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

class DataProcess:
    """Text-cleaning pipeline for the ``text`` column of a dataframe.

    The per-review helpers take one string and return the cleaned string.
    :meth:`apply_all_processing_on_train_test_data` chains them over
    ``self.data["text"]``.
    """

    # Characters replaced by a plain space in Review_processing.
    _SPACE_LIKE = (
        "\n", "\r", "\t", "\xa0", "\u200b", "\u200c", "\u200d", "\ufeff", "\ufeef",
    )

    # Class-wide lazy caches: the NLTK stop-word list and the lemmatizer are
    # built once instead of once per review.
    _stop_words = None
    _lemmatizer = None

    def __init__(self, data) -> None:
        self.data = data

    @classmethod
    def _english_stop_words(cls):
        """Return the cached set of NLTK English stop words."""
        if cls._stop_words is None:
            cls._stop_words = set(stopwords.words("english"))
        return cls._stop_words

    @classmethod
    def _shared_lemmatizer(cls):
        """Return the cached WordNet lemmatizer instance."""
        if cls._lemmatizer is None:
            cls._lemmatizer = WordNetLemmatizer()
        return cls._lemmatizer

    @staticmethod
    def _keep_words(Review, keep):
        """Split on whitespace, keep the words for which ``keep(word)`` is true, re-join."""
        return " ".join(word for word in Review.split() if keep(word))

    def check_null_values(self):
        """
        Print the number of null values per column of the dataframe.
        """
        logging.info("Checking for null values in dataframe")
        try:
            print("Checking for null values in dataframe")
            print(self.data.isnull().sum())
        except Exception as e:
            logging.error(f"Error in checking null values: {e}")
            raise

    def apply_all_processing_on_train_test_data(self):
        """Run the full cleaning pipeline over ``self.data["text"]``.

        Missing values become empty strings and every other value is converted
        to ``str`` first, so a single NaN row no longer aborts the whole run.
        The column is only overwritten once every step has succeeded.

        Returns:
            The dataframe (``self.data``, modified in place), or ``None`` if a
            step failed; the error is logged in that case.
        """
        logging.info(
            "Applying all the Review processing methods on train and test data"
        )
        try:
            text = self.data["text"].fillna("").astype(str)
            steps = (
                self.Review_processing,
                self.remove_punctuation,
                self.remove_numbers,
                self.remove_special_characters,
                self.remove_short_words,
                self.remove_stopwords,
                self.lemmatization,
            )
            for step in steps:
                text = text.apply(step)
            self.data["text"] = text
            return self.data

        except Exception as e:
            logging.error(
                "Error in applying all the Review processing methods on train and test data"
            )
            logging.error(e)
            return None

    def Review_processing(self, Review):
        """Lower-case the review and replace whitespace-like characters by spaces.

        Returns:
            The normalised string, or ``None`` when ``Review`` is not a string
            (the error is logged).
        """
        # DEBUG rather than INFO: this runs once per row of the dataframe.
        logging.debug("Applying Review processing methods on train and test data")
        try:
            Review = Review.lower()
            for char in self._SPACE_LIKE:
                Review = Review.replace(char, " ")
        except Exception as e:
            logging.error(
                "Error in applying Review processing methods on train and test data"
            )
            logging.error(e)
            return None
        return Review

    def stemming(self, Review):
        """Porter-stem every whitespace-separated word; ``None`` on error."""
        logging.debug("Applying stemming methods on train and test data")
        try:
            ps = PorterStemmer()
            Review = " ".join(ps.stem(word) for word in Review.split())
        except Exception as e:
            logging.error("Error in applying stemming methods on train and test data")
            logging.error(e)
            return None
        return Review

    def lemmatization(self, Review):
        """Lemmatize every whitespace-separated word with WordNet."""
        lem = DataProcess._shared_lemmatizer()
        return " ".join(lem.lemmatize(word) for word in Review.split())

    def remove_stopwords(self, Review):
        """Drop English stop words (NLTK list)."""
        stop_words = DataProcess._english_stop_words()
        return DataProcess._keep_words(Review, lambda word: word not in stop_words)

    def remove_punctuation(self, Review):
        """Strip punctuation characters from every word.

        The previous implementation discarded any word that touched a
        punctuation mark ("hello," vanished entirely); now only the
        punctuation is removed and words that become empty are dropped.
        """
        # [^\w\s] matches every character that is neither a word character nor
        # whitespace; "_" counts as a word character, so it is removed explicitly.
        stripped = (re.sub(r"[^\w\s]|_", "", word) for word in Review.split())
        return " ".join(word for word in stripped if word)

    def remove_numbers(self, Review):
        """Drop words that consist only of numeric characters."""
        return DataProcess._keep_words(Review, lambda word: not word.isnumeric())

    def remove_special_characters(self, Review):
        """Keep only purely alphabetic words."""
        return DataProcess._keep_words(Review, str.isalpha)

    def remove_short_words(self, Review):
        """Drop words of one or two characters."""
        return DataProcess._keep_words(Review, lambda word: len(word) > 2)

    def remove_stopwords_and_punctuation(self, Review):
        """Drop stop words, then keep only purely alphabetic words."""
        stop_words = DataProcess._english_stop_words()
        return DataProcess._keep_words(
            Review, lambda word: word not in stop_words and word.isalpha()
        )

    def remove_stopwords_and_punctuation_and_numbers(self, Review):
        """Drop stop words and keep only purely alphabetic, non-numeric words."""
        stop_words = DataProcess._english_stop_words()
        return DataProcess._keep_words(
            Review,
            lambda word: word not in stop_words
            and word.isalpha()
            and not word.isnumeric(),
        )

    def remove_nan_values(self, df):
        """Return ``df`` with missing values replaced by the string "UNKOWN"."""
        # The spelling of the placeholder is kept: it is a runtime value that
        # downstream data may already contain.
        df = df.fillna("UNKOWN")
        return df

class DataValidation:
    """Split a prepared dataframe into train and test parts."""

    def __init__(self, data) -> None:
        self.data = data

    def data_splitting(self, test_size=0.25, random_state=None):
        """Split ``self.data`` into features/target and train/test sets.

        The target is the ``y`` column; every other column is a feature.

        Args:
            test_size: forwarded to ``train_test_split`` (default 0.25, the
                value that used to be hard-coded).
            random_state: forwarded to ``train_test_split``; pass an int to
                make the split reproducible. ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            tuple: ``(x_train, x_test, y_train, y_test)``.

        Raises:
            Exception: any error from the split (for example a missing ``y``
                column), after logging it.
        """
        logging.info("Data Splitting")
        try:
            X = self.data.drop('y', axis=1)
            Y = self.data['y']

            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=test_size, random_state=random_state
            )
            return x_train, x_test, y_train, y_test
        except Exception as e:
            logging.error(f"Error splitting data: {e}")
            raise
# if __name__ == "__main__":
#     # data_utils = DataUtils()
#     # jigsaw_data, c3_data, ruddit_data = data_utils.load_different_data()
#     # Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
#     # data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)
#     # data_process = DataProcess(data, jigsaw_data, c3_data, ruddit_data)
#     # data_process.check_null_values()
#     # data_processed = data_process.apply_all_processing_on_train_test_data() 
#     # data_processed.head() 
#     pass 

In [None]:
import numpy as np 
import pandas as pd 
import logging  
from sklearn.feature_extraction.text import TfidfVectorizer 
import joblib  
import nltk
from nltk import sent_tokenize,word_tokenize

class FeatureEngineering:
    """Hand-crafted length/complexity features computed from the ``text`` column."""

    def __init__(self,data) -> None:
        self.data = data

    @staticmethod
    def _mean_or_zero(values):
        """Mean of ``values``, or 0.0 when the sequence is empty.

        ``np.mean([])`` returns NaN and emits a RuntimeWarning; empty texts
        would otherwise put NaN into the feature columns.
        """
        return float(np.mean(values)) if len(values) else 0.0

    def get_count_of_words(self):
        """Number of whitespace-separated words per text."""
        logging.info("Get count of words in data")
        word_count = self.data["text"].apply(lambda x: len(x.split()))
        return word_count

    def get_count_of_sentences(self):
        """Number of sentences per text according to ``sent_tokenize``."""
        logging.info("Get count of sentences in data")
        sentences_count = self.data["text"].apply(
            lambda x: len(sent_tokenize(x))
        )
        return sentences_count

    def get_average_word_length(self):
        """Mean number of characters per word (0.0 for a text without words)."""
        logging.info("Get average word length in data")
        average_word_length = self.data["text"].apply(
            lambda x: self._mean_or_zero([len(word) for word in x.split()])
        )
        return average_word_length

    def get_average_sentence_length(self):
        """Mean number of characters per sentence (0.0 for a text without sentences)."""
        logging.info("Get average sentence length in data")
        average_sentence_length = self.data["text"].apply(
            lambda x: self._mean_or_zero(
                [len(sentence) for sentence in sent_tokenize(x)]
            )
        )
        return average_sentence_length

    def get_average_sentence_complexity(self):
        """Mean number of tokens per sentence (0.0 for a text without sentences)."""
        logging.info("Get average sentence complexity in data")
        average_sentence_complexity = self.data["text"].apply(
            lambda x: self._mean_or_zero(
                [len(word_tokenize(sentence)) for sentence in sent_tokenize(x)]
            )
        )
        return average_sentence_complexity

    def get_average_word_complexity(self):
        """Mean number of tokens that ``word_tokenize`` yields per word (0.0 if no words)."""
        logging.info("Get average word complexity in data")
        average_word_complexity = self.data["text"].apply(
            lambda x: self._mean_or_zero(
                [len(word_tokenize(word)) for word in x.split()]
            )
        )
        return average_word_complexity

    def add_features(self):
        """Compute all features and attach them to ``self.data`` as new columns.

        Returns:
            The dataframe ``self.data`` (modified in place) with six extra columns.
        """
        logging.info("Add features")
        word_count = self.get_count_of_words()
        sentences_count = self.get_count_of_sentences()
        average_word_length = self.get_average_word_length()
        average_sentence_length = self.get_average_sentence_length()
        average_sentence_complexity = self.get_average_sentence_complexity()
        average_word_complexity = self.get_average_word_complexity()

        self.data["count_of_words"] = word_count
        # NOTE: the column name is misspelled ("setences"); it is kept because
        # the name is part of the produced dataset's schema.
        self.data["count_of_setences"] = sentences_count
        self.data["average_word_length"] = average_word_length
        self.data["average_sentence_length"] = average_sentence_length
        self.data["average_sentence_complexity"] = average_sentence_complexity
        self.data["average_word_complexity"] = average_word_complexity

        return self.data


class Vectorization:
    """
    NOT BEING USED

    TF-IDF vectorisation of the ``text`` column of a dataframe.
    """
    def __init__(self, df) -> None:
        self.df = df

    def vectorize(self) -> pd.DataFrame:
        """
        Only vectorize concatenated data

        Fits a 5000-feature TF-IDF vectorizer on ``self.df["text"]``, replaces
        the ``text`` column by one column per vocabulary term, writes the
        result to ``Final_Training_vectorized.csv`` and pickles the fitted
        vectorizer to ``vectorizer.pkl``.

        Returns:
            pd.DataFrame: the non-text columns of ``self.df`` followed by the
            TF-IDF feature columns, with a fresh 0..n-1 index.
        """
        vectorizer = TfidfVectorizer(max_features=5000)

        tfidf_matrix = vectorizer.fit_transform(self.df["text"]).toarray()
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is its replacement (available since 1.0).
        tfidf_features = pd.DataFrame(
            tfidf_matrix, columns=vectorizer.get_feature_names_out()
        )
        print(tfidf_features.shape)

        # Drop the raw text *before* joining: dropping "text" afterwards would
        # also delete a TF-IDF feature that happens to be the word "text".
        # reset_index returns a new frame, so the caller's dataframe is untouched.
        base_columns = self.df.drop(columns=["text"]).reset_index(drop=True)

        Final_Training_data = pd.concat([base_columns, tfidf_features], axis=1)
        print(Final_Training_data.shape)
        Final_Training_data.to_csv("Final_Training_vectorized.csv", index=False)

        # save the vectorizer to disk
        joblib.dump(vectorizer, "vectorizer.pkl")
        return Final_Training_data


# if __name__ == '__main__': 
#     # data_utils = DataUtils()
#     # jigsaw_data, c3_data, ruddit_data = data_utils.load_different_data()
#     # Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
#     # data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)
#     # data = data.sample(5000)
#     # # data.reset_index(inplace=True)
    
#     # data_process = DataProcess(data, jigsaw_data, c3_data, ruddit_data)
#     # data_process.check_null_values()
#     # data_processed = data_process.apply_all_processing_on_train_test_data() 
#     # # vectrorize = Vectorization(data) 
#     # # Final_Training_data = vectrorize.vectorize() 
#     # print(data.shape)
#     # print(data.head())
#     # fe = FeatureEngineering(data)
#     # data = fe.add_features()
#     # print(data.shape)
#     # print(data.head())

In [None]:
from joblib.externals.loky.backend.spawn import import_main_path
from numpy.lib.function_base import gradient
import pandas as pd
import numpy as np
import logging

from scipy.sparse import data
from scipy.sparse.construct import random
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import optuna
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import catboost
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from mlxtend.regressor import StackingRegressor

class Hyperparameters_Optimization:
    """Optuna objective functions for the regressors trained in this notebook.

    Every objective samples hyperparameters from ``trial``, fits the regressor
    on the training split and returns ``reg.score`` on the test split, which
    the study is expected to maximise.

    NOTE(review): the objectives score on ``x_test``/``y_test``, so the tuned
    parameters are selected on the same data that is later used for
    evaluation; consider passing a separate validation split here.
    """

    def __init__(self, x_train, y_train, x_test, y_test) -> None:
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def _fit_and_score(self, reg):
        """Fit ``reg`` on the training split and return its score on the test split."""
        reg.fit(self.x_train, self.y_train)
        return reg.score(self.x_test, self.y_test)

    def optimize_decisiontrees(self, trial):
        """Objective for ``DecisionTreeRegressor`` (max_depth, min_samples_split)."""
        # criterion = trial.suggest_categorical("criterion", ("squared_error", "friedman_mse", "absolute_error", "poisson"))
        max_depth = trial.suggest_int("max_depth", 1, 20)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        reg = DecisionTreeRegressor(
            max_depth=max_depth, min_samples_split=min_samples_split,
        )
        return self._fit_and_score(reg)

    def optimize_randomforest(self, trial):
        """Objective for ``RandomForestRegressor``."""
        logging.info("optimize_randomforest")
        n_estimators = trial.suggest_int("n_estimators", 1, 200)
        max_depth = trial.suggest_int("max_depth", 1, 20)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        reg = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
        )
        return self._fit_and_score(reg)

    def Optimize_Adaboost_regressor(self, trial):
        """Objective for ``AdaBoostRegressor`` (n_estimators, learning_rate)."""
        logging.info("Optimize_Adaboost_regressor")
        n_estimators = trial.suggest_int("n_estimators", 1, 200)
        # suggest_float replaces the deprecated suggest_uniform.
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.99)
        reg = AdaBoostRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
        return self._fit_and_score(reg)

    def Optimize_LightGBM(self, trial):
        """Objective for ``LGBMRegressor``."""
        logging.info("Optimize_LightGBM")
        n_estimators = trial.suggest_int("n_estimators", 1, 200)
        max_depth = trial.suggest_int("max_depth", 1, 20)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.99)
        reg = LGBMRegressor(
            n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth
        )
        return self._fit_and_score(reg)

    def Optimize_Xgboost_regressor(self, trial):
        """Objective for ``xgb.XGBRegressor``."""
        logging.info("Optimize_Xgboost_regressor")
        param = {
            "max_depth": trial.suggest_int("max_depth", 1, 30),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            "learning_rate": trial.suggest_float(
                "learning_rate", 1e-7, 10.0, log=True
            ),
            "n_estimators": trial.suggest_int("n_estimators", 1, 200),
        }
        return self._fit_and_score(xgb.XGBRegressor(**param))

    def Optimize_Catboost_regressor(self, trial):
        """Objective for ``CatBoostRegressor``."""
        logging.info("Optimize_Catboost_regressor")
        param = {
            "iterations": trial.suggest_int("iterations", 1, 200),
            "learning_rate": trial.suggest_float(
                "learning_rate", 1e-7, 1.0, log=True
            ),
            "depth": trial.suggest_int("depth", 1, 16),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-7, 10.0, log=True),
            "border_count": trial.suggest_int("border_count", 1, 20),
            "rsm": trial.suggest_float("rsm", 0.5, 1.0),
            "od_type": trial.suggest_categorical(
                "od_type", ("IncToDec", "Iter", "None")
            ),
            "od_wait": trial.suggest_int("od_wait", 1, 20),
            "random_seed": trial.suggest_int("random_seed", 1, 20),
            "loss_function": trial.suggest_categorical(
                "loss_function", ("RMSE", "MAE")
            ),
        }
        return self._fit_and_score(CatBoostRegressor(**param))


class ModelTraining:
    """Train the notebook's regressors, optionally tuning them with Optuna.

    Every public method returns the fitted model, or ``None`` when training
    failed (the error is logged).
    """

    # Number of Optuna trials per tuning run.
    N_TRIALS = 100

    def __init__(self, x_train, y_train, x_test, y_test) -> None:
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def _fit(self, model):
        """Fit ``model`` on the training split and return it."""
        model.fit(self.x_train, self.y_train)
        return model

    def _tune_and_fit(self, objective_name, estimator_cls):
        """Tune with Optuna, then fit ``estimator_cls`` with the best parameters.

        ``objective_name`` is the name of the objective method on
        ``Hyperparameters_Optimization``. The parameter names sampled by each
        objective are the constructor keyword names of the matching estimator,
        so *all* tuned parameters are forwarded with ``**best_params``.
        """
        hyper_opt = Hyperparameters_Optimization(
            self.x_train, self.y_train, self.x_test, self.y_test
        )
        study = optuna.create_study(direction="maximize")
        study.optimize(getattr(hyper_opt, objective_name), n_trials=self.N_TRIALS)
        best_params = study.best_trial.params
        print("Best parameters : ", best_params)
        logging.info("Best parameters : %s", best_params)
        return self._fit(estimator_cls(**best_params))

    def decision_trees(self, fine_tuning=True):
        """Train a ``DecisionTreeRegressor`` (tuned, or with fixed defaults)."""
        logging.info("Entered for training Decision Trees model")
        try:
            if fine_tuning:
                return self._tune_and_fit(
                    "optimize_decisiontrees", DecisionTreeRegressor
                )
            return self._fit(
                DecisionTreeRegressor(
                    criterion="squared_error", max_depth=7, min_samples_split=13
                )
            )
        except Exception as e:
            logging.error("Error in training Decision Trees model")
            logging.error(e)
            return None

    def random_forest(self, fine_tuning=True):
        """Train a ``RandomForestRegressor`` (tuned, or with fixed defaults)."""
        logging.info("Entered for training Random Forest model")
        try:
            if fine_tuning:
                return self._tune_and_fit(
                    "optimize_randomforest", RandomForestRegressor
                )
            return self._fit(
                RandomForestRegressor(
                    n_estimators=152, max_depth=20, min_samples_split=17
                )
            )
        except Exception as e:
            logging.error("Error in training Random Forest model")
            logging.error(e)
            return None

    def adabooost_regressor(self, fine_tuning=True):
        """Train an ``AdaBoostRegressor`` (tuned, or with fixed defaults)."""
        logging.info("Entered for training Adaboost regressor model")
        try:
            if fine_tuning:
                return self._tune_and_fit(
                    "Optimize_Adaboost_regressor", AdaBoostRegressor
                )
            return self._fit(AdaBoostRegressor(n_estimators=200, learning_rate=0.01))
        except Exception as e:
            logging.error("Error in training Adaboost regressor model")
            logging.error(e)
            return None

    def LightGBM(self, fine_tuning=True):
        """Train an ``LGBMRegressor`` (tuned, or with fixed defaults)."""
        logging.info("Entered for training LightGBM model")
        try:
            if fine_tuning:
                return self._tune_and_fit("Optimize_LightGBM", LGBMRegressor)
            return self._fit(
                LGBMRegressor(n_estimators=200, learning_rate=0.01, max_depth=20)
            )
        except Exception as e:
            logging.error("Error in training LightGBM model")
            logging.error(e)
            return None

    def xgboost(self, fine_tuning=True):
        """Train an ``xgb.XGBRegressor`` (tuned, or with fixed defaults)."""
        logging.info("Entered for training XGBoost model")
        try:
            if fine_tuning:
                return self._tune_and_fit(
                    "Optimize_Xgboost_regressor", xgb.XGBRegressor
                )
            return self._fit(
                xgb.XGBRegressor(n_estimators=200, learning_rate=0.01, max_depth=20)
            )
        except Exception as e:
            logging.error("Error in training XGBoost model")
            logging.error(e)
            return None

    def Catboost(self, fine_tuning=True):
        """Train a ``CatBoostRegressor`` (tuned, or with fixed defaults).

        The tuned model now receives every parameter of the best trial;
        previously only four of the ten tuned parameters were forwarded, so
        the returned model differed from the one the study had selected.
        """
        logging.info("Entered for training Catboost model")
        try:
            if fine_tuning:
                return self._tune_and_fit(
                    "Optimize_Catboost_regressor", CatBoostRegressor
                )
            # CatBoost documents a maximum tree depth of 16; the former
            # depth=20 made this branch fail and return None. 16 is the
            # closest valid value -- NOTE(review): a smaller depth is likely
            # more practical for training time and memory.
            return self._fit(
                CatBoostRegressor(
                    iterations=200, depth=16, l2_leaf_reg=0.01, learning_rate=0.01
                )
            )
        except Exception as e:
            logging.error("Error in training Catboost model")
            logging.error(e)
            return None

    def stacking_regression(self):
        """Train an mlxtend ``StackingRegressor`` over RF, AdaBoost and XGBoost.

        The random-forest configuration is used both as a base regressor and
        as the meta regressor.
        """
        logging.info("Entered for stacking model")
        try:
            rf_tree = RandomForestRegressor(
                n_estimators=152, max_depth=20, min_samples_split=17
            )
            adaboost = AdaBoostRegressor(n_estimators=200, learning_rate=0.01)
            xgb_reg = XGBRegressor(n_estimators=200, learning_rate=0.01, max_depth=20)
            # cat_reg = CatBoostRegressor(
            #     iterations=200, depth=20, l2_leaf_reg=0.01, learning_rate=1e-7
            # )
            # lr = LinearRegression() # use random forest here
            reg = StackingRegressor(
                regressors=[rf_tree, adaboost, xgb_reg],
                meta_regressor=rf_tree,
            )
            return self._fit(reg)
        except Exception as e:
            logging.error("Error in stacking model")
            logging.error(e)
            return None


# if __name__ == "__main__":
#     data_utils = DataUtils()
#     jigsaw_data, c3_data, ruddit_data = data_utils.load_different_data()
#     Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
#     data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)
#     data = data.sample(100)
#     data_process = DataProcess(data, jigsaw_data, c3_data, ruddit_data)
#     data_process.check_null_values()
#     data_processed = data_process.apply_all_processing_on_train_test_data()
#     vectrorize = Vectorization(data)
#     Final_Training_data = vectrorize.vectorize()
#     data_val = DataValidation(Final_Training_data)
#     x_train, x_test, y_train, y_test = data_val.data_splitting()
#     print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

#     model_train = ModelTraining(x_train, y_train, x_test, y_test)
#     # ran_for = model_train.random_forest(fine_tuning=True)
#     # ada_for = model_train.adabooost_regressor(fine_tuning=True)
#     # lgbm_for = model_train.LightGBM(fine_tuning=True)
#     # xgb_for = model_train.xgboost(fine_tuning=True)
#     # catboost_for = model_train.Catboost(fine_tuning=True)
#     stack_for = model_train.stacking_regression()
#     print(stack_for)

In [None]:
import multiprocessing

from gensim.models import Word2Vec
# Logical CPU count reported by the OS; train_gensim_model derives its
# Word2Vec worker count from this value.
cores = multiprocessing.cpu_count()
# English stop-word list from NLTK; normalize_document drops every token
# that appears in it.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one raw text document.

    Removes every character that is not an ASCII letter, a digit or
    whitespace, lower-cases and trims the text, tokenizes it with
    ``nltk.word_tokenize`` and drops the tokens found in the module-level
    ``stop_words`` list.

    Args:
        doc: Raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # ``re.sub`` is ``count``, so ``re.I | re.A`` (== 258) used to cap the
    # number of replacements at 258 instead of acting as flags.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize, then filter stop words out of the document
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create the document from the filtered tokens
    return ' '.join(filtered_tokens)

def train_gensim_model(train=True):
    """Train a Word2Vec model on the combined, preprocessed text corpus.

    The data is loaded and prepared through ``DataUtils``, cleaned by
    ``DataProcess``, normalized with ``normalize_document`` and split into
    tokens before being handed to gensim.

    Args:
        train: Currently unused; kept so existing callers keep working.

    Returns:
        The trained ``Word2Vec`` model.

    Raises:
        Exception: Any error raised while loading, preprocessing or training
            is logged and re-raised.
    """
    logging.info("Training the model")
    try:
        data_utils = DataUtils()
        # prepare_data() loads the raw CSVs itself, so a separate
        # load_different_data() call would only read every file twice.
        Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
        data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)

        data_process = DataProcess(data)
        data_process.check_null_values()
        data_processed = data_process.apply_all_processing_on_train_test_data()

        # gensim expects an iterable of token lists. A plain string is
        # iterated character by character, which would train the model on
        # single characters instead of words.
        corpus = [
            normalize_document(doc).split()
            for doc in data_processed['text'].values
        ]

        w2v_model = Word2Vec(
            min_count=20,
            window=40,
            vector_size=300,
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=20,
            # keep at least one worker thread on single-core machines
            workers=max(1, cores - 1),
        )

        w2v_model.build_vocab(corpus, progress_per=10000)
        w2v_model.train(
            corpus,
            total_examples=w2v_model.corpus_count,
            epochs=30,
            report_delay=1,
        )
        return w2v_model
    except Exception as e:
        logging.error(f"Error: {e}")
        # bare raise keeps the original traceback intact
        raise

In [None]:
# Run the full load -> preprocess -> train pipeline and keep the resulting
# Word2Vec model in `w2v_model`.
w2v_model = train_gensim_model()

In [None]:
# Progress marker: printed once execution reaches this cell.
print("Done")

In [None]:
import numpy as np 
import pandas as pd  
from gensim.models import FastText 

class WordEmbeddings:
    """Builds fixed-size document vectors from a trained word-embedding model."""

    def __init__(self) -> None:
        pass

    def averaged_word2vec_vectorizer(self, corpus, model, num_features):
        """Represent each document as the mean of its in-vocabulary word vectors.

        Args:
            corpus: Iterable of documents. Each document is either a sequence
                of tokens or a single string, which is split on whitespace.
            model: Embedding model exposing ``wv.index_to_key`` and
                ``wv[word]``.
            num_features: Length of the vectors stored in ``model``.

        Returns:
            A float64 array of shape ``(number of documents, num_features)``.
            A document with no in-vocabulary word maps to the zero vector.
        """
        vocabulary = set(model.wv.index_to_key)

        def average_word_vectors(words, model, vocabulary, num_features):
            # A raw string would otherwise be iterated character by
            # character, so no real word could ever match the vocabulary.
            if isinstance(words, str):
                words = words.split()

            feature_vector = np.zeros((num_features,), dtype="float64")
            nwords = 0

            for word in words:
                if word in vocabulary:
                    nwords += 1
                    feature_vector += model.wv[word]
            if nwords:
                feature_vector /= nwords

            return feature_vector

        features = [
            average_word_vectors(document, model, vocabulary, num_features)
            for document in corpus
        ]
        if not features:
            # keep the result two-dimensional even for an empty corpus
            return np.zeros((0, num_features), dtype="float64")
        return np.array(features)
# Load the pre-trained embedding model here so this cell does not depend on
# a cell further down having been executed first (`word2vec` was otherwise
# undefined at this point on a fresh kernel).
word2vec = Word2Vec.load("../input/word2vectrained/word2vec.model")

data_utils = DataUtils()
# prepare_data() loads the raw CSVs itself, so calling load_different_data()
# first would only read every file twice.
Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)

data_process = DataProcess(data)
data_process.check_null_values()
data_processed = data_process.apply_all_processing_on_train_test_data()

# One normalized string per document.
# NOTE(review): each entry is a single string rather than a token list;
# confirm the vectorizer and the loaded model agree on the token unit.
corpus = [normalize_document(doc) for doc in data_processed['text'].values]

prediction = WordEmbeddings()
# 300 is the embedding length expected from the loaded model.
doc_vecs_ft = prediction.averaged_word2vec_vectorizer(corpus, word2vec, 300)

In [None]:
from gensim.models import Word2Vec 
# Load the saved Word2Vec model from the attached input dataset; `word2vec`
# is the model handed to averaged_word2vec_vectorizer in this notebook.
word2vec = Word2Vec.load("../input/word2vectrained/word2vec.model") 

In [None]:
# Persist the model returned by train_gensim_model() to the working directory.
w2v_model.save("word2vec.model")

In [None]:
# Wrap the document vectors in a DataFrame: one row per document, one column
# per embedding dimension. The training cell adds the target column `y`.
df = pd.DataFrame(doc_vecs_ft)

In [None]:
import numpy as np 
import pandas as pd 
import logging

from pandas import core 
import joblib  
import nltk
import re
import numpy as np

stop_words = nltk.corpus.stopwords.words('english')

# ------------------------ data ingestion ------------------------------
data_utils = DataUtils()
# prepare_data() loads the raw CSVs itself, so calling load_different_data()
# first would only read every file twice.
Jigsaw, c3_data, ruddit_data = data_utils.prepare_data()
data = data_utils.concatenate_dfs(Jigsaw, c3_data, ruddit_data)

# ------------------------ attach the target ---------------------------
# `df` holds one embedding row per document with a fresh 0..n-1 index, so the
# target is attached by position: assigning the Series directly would align
# on index labels, which fails or silently misaligns when the concatenated
# frame does not carry the same index.
# NOTE(review): assumes `data` rows are in the same order as the documents
# that produced `df` - confirm against the preprocessing steps.
if len(data) != len(df):
    raise ValueError(
        f"Feature rows ({len(df)}) and target rows ({len(data)}) differ"
    )
df["y"] = data["y"].values

# ------------------------ train / test split --------------------------
data_val = DataValidation(df)
x_train, x_test, y_train, y_test = data_val.data_splitting()
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# ------------------------ model training ------------------------------
model_train = ModelTraining(x_train, y_train, x_test, y_test)
# Despite its name, `lightgbm` holds the fitted stacking regressor; the
# prediction cell refers to it by this name.
lightgbm = model_train.stacking_regression()
if lightgbm is None:
    # stacking_regression() logs the error and returns None on failure;
    # stop here instead of failing later with an AttributeError on predict.
    raise RuntimeError("Stacking model training failed; see the logged error")

In [None]:
# Progress marker: printed once execution reaches this cell.
print("Done")

In [None]:
# Comments that have to be scored for the submission.
comms_to_score = pd.read_csv(r"../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Apply the same cleaning pipeline as for the training data.
data_process = DataProcess(comms_to_score)
data_process.check_null_values()
data_processed = data_process.apply_all_processing_on_train_test_data()

# One normalized string per comment.
corpus = list(map(normalize_document, data_processed['text'].values))

# Average the word vectors of every comment into a 300-dimensional row.
prediction = WordEmbeddings()
doc_vecs_ft_submit = prediction.averaged_word2vec_vectorizer(corpus, word2vec, 300)

In [None]:
# Wrap the submission vectors in a DataFrame, mirroring how the training
# features were built, and score them. `lightgbm` is the regressor returned
# by model_train.stacking_regression() in the training cell.
dfs_submit = pd.DataFrame(doc_vecs_ft_submit) 
preds = lightgbm.predict(dfs_submit)

In [None]:
# Attach the predicted scores and remove the raw text from the frame.
comms_to_score["score"] = preds
# errors="ignore" keeps this cell re-runnable: after the first run the
# "text" column is already gone and a plain drop would raise KeyError.
comms_to_score.drop(columns="text", inplace=True, errors="ignore")

In [None]:
# Show the submission frame: the original columns minus `text`, plus `score`.
comms_to_score

In [None]:
# Write the submission file to the working directory, without the index column.
comms_to_score.to_csv("submission.csv", index=False)