In [1]:
from collections import namedtuple 
import shutil, time, re
from src.config.spark_manager import spark_session
from src.constants.training_pipeline import *
from src.components.data_validation import add_mean_indicator_col_per_user
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.pipeline import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.sql.functions import lit, col, DataFrame
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/08 08:50:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/08 08:50:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
def _union_all(frames):
    """Fold a non-empty list of DataFrames into one using positional ``union``."""
    combined = frames[0]
    for frame in frames[1:]:
        combined = combined.union(frame)
    return combined


def read_downloaded_data(paths:list):
    """Read every file under each directory in ``paths`` into one Spark DataFrame.

    Each directory is treated as one class: its last path component is written to
    ``TARGET_COLUMN_NAME``. Each file is treated as one user: the part of the file
    name before the first ".", suffixed with ``_<directory name>``, is written to
    ``USER_COLUMN_NAME``.

    Args:
        paths: directories to scan. A trailing path separator is tolerated.

    Returns:
        The positional union of all files read, with the two columns above appended.

    Raises:
        ValueError: if none of the directories contains a file.
        Exception: any error raised while listing or reading is printed and re-raised,
            so callers never receive ``None`` in place of a DataFrame.
    """
    try:
        labelled_frames = []
        for path in paths:
            # normpath drops a trailing separator, so ".../UGE/" yields "UGE";
            # splitting on "/" and taking the last piece would yield "" instead.
            user_type = os.path.basename(os.path.normpath(path))
            user_frames = []
            # Sorted so the union order does not depend on filesystem listing order.
            for file in sorted(os.listdir(path)):
                file_path = os.path.join(path, file)
                user = file.split(sep='.')[0]
                user_df = spark_session.read.csv(file_path, header=True, inferSchema=True)
                user_frames.append(
                    user_df.withColumn(USER_COLUMN_NAME, lit(f"{user}_{user_type}"))
                )
            if not user_frames:
                # An empty directory contributes nothing (and must not reuse the
                # previous directory's frame under a new label).
                continue
            labelled_frames.append(
                _union_all(user_frames).withColumn(TARGET_COLUMN_NAME, lit(user_type))
            )
        if not labelled_frames:
            raise ValueError(f"no files found under any of {paths}")
        return _union_all(labelled_frames)
    except Exception as e:
        print(e)
        raise

In [3]:
# One directory per class. Neither path carries a trailing slash: the class label is
# taken from the last path component, which is empty when the path ends in "/".
df = read_downloaded_data(['../user_downloaded_data/UBE', '../user_downloaded_data/UGE'])

In [4]:
# Row count of the combined data as loaded, before any filtering.
df.count()

134764

In [5]:
# Keep only rows where every indicator column is strictly below INDICATOR_THRESHOLD.
for column in INDICATOR_COLS:
    df = df.where(col(column) < INDICATOR_THRESHOLD)

# Presumably adds a per-user mean for each indicator column (going by the helper's
# name — confirm in src.components.data_validation), then drops the unused columns.
df = add_mean_indicator_col_per_user(
    df, USER_COLUMN_NAME, INDICATOR_COLS
).drop(*COLS_TO_BE_REMOVED)

                                                                                

In [6]:
# Row count and column count of the current `df`.
[df.count(), len(df.columns)]

[126632, 19]

In [18]:
def prepare_train_test_data(data: DataFrame, train_percentage:float, 
                                categorical_cols: list)-> tuple:
    """Split ``data`` into train and test sets and back-fill unseen categories.

    After a seeded random split, every value of each categorical column that occurs
    in the test set but not in the train set is looked up in the test set, and one
    test row per such value is appended to the train set. The test set itself is
    returned unchanged, so those rows appear in both sets.
    NOTE(review): presumably this is so an indexer fitted on train has seen every
    category present in test — confirm against the training code.

    Args:
        data: the full dataset to split.
        train_percentage: fraction of rows for the train split; the test split
            gets ``1 - train_percentage``.
        categorical_cols: names of the columns to check for unseen values.

    Returns:
        A ``(train, test)`` tuple of DataFrames.

    Raises:
        Exception: any error is printed and re-raised, so callers unpacking the
            result never receive ``None``.
    """
    try:
        train, test = data.randomSplit([train_percentage, 1 - train_percentage], seed=43)
        backfill_frames = []
        for column in categorical_cols:
            df_diff = test.select(col(column)).subtract(train.select(col(column))).collect()
            print(f"column {column} in test dataset has {len(df_diff)} values not present in train dataset")
            # A null can show up in the difference but can never match an equality
            # filter, so it is excluded from the lookup (it is still counted above).
            unseen_values = [row[column] for row in df_diff if row[column] is not None]
            if unseen_values:
                # One filter for all unseen values of this column; dropDuplicates
                # keeps a single representative row per value.
                backfill_frames.append(
                    test.where(col(column).isin(unseen_values)).dropDuplicates([column])
                )
        for frame in backfill_frames:
            train = train.union(frame)
        return train, test
    except Exception as e:
        print(e)
        raise


In [4]:
# NOTE(review): this rebinds `df`, replacing the frame built by the cells above, and
# the cell's execution count repeats an earlier one — confirm the notebook still runs
# top to bottom on a fresh kernel.
df = spark_session.read.csv('./final_data2.csv', inferSchema=True, header=True)

                                                                                

In [11]:
# Number of rows for each (day, user) pair; show() prints only the first 20 groups.
(
    df.select('day', 'user')
    .groupBy('day', 'user')
    .count()
    .show()
)



+----------+-----+-----+
|       day| user|count|
+----------+-----+-----+
|2021-06-16|152.0| 3803|
|2021-06-12|151.0|  844|
|2021-06-14|153.0| 1635|
|2021-06-15|152.0| 3485|
|2021-06-12|152.0| 3777|
|2021-06-14|151.0| 1037|
|2021-06-10|153.0|  220|
|2021-06-14|150.0|  402|
|2021-06-15|151.0| 1789|
|2021-06-12|154.0| 6236|
|2021-06-10|150.0|  165|
|2021-06-12|153.0| 1132|
|2021-06-12|150.0|  549|
|2021-06-13|153.0| 1443|
|2021-06-13|150.0|  573|
|2021-06-13|152.0| 3856|
|2021-06-14|154.0| 4100|
|2021-06-16|153.0| 1209|
|2021-06-10|151.0|  726|
|2021-06-10|154.0|  210|
+----------+-----+-----+
only showing top 20 rows



                                                                                