In [1]:
import findspark

findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import pyodbc

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
# Directory that holds the daily log files. File names are appended to it by
# plain string concatenation, so the trailing separator is required.
path = "D:\\Dataset\\log_content\\"

In [11]:
df = spark.read.json(path + "20220401.json")

In [16]:
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- _index: string (nullable = true)
 |-- _score: long (nullable = true)
 |-- _source: struct (nullable = true)
 |    |-- AppName: string (nullable = true)
 |    |-- Contract: string (nullable = true)
 |    |-- Mac: string (nullable = true)
 |    |-- TotalDuration: long (nullable = true)
 |-- _type: string (nullable = true)



In [17]:
# Flatten the nested `_source` struct: each selected field becomes a top-level column.
df = df.select(*("_source." + field for field in ("AppName", "Contract", "Mac", "TotalDuration")))

In [18]:
df.show(10)

+-------+---------+------------+-------------+
|AppName| Contract|         Mac|TotalDuration|
+-------+---------+------------+-------------+
|  KPLUS|HNH579912|0C96E62FC55C|          254|
|  KPLUS|HUFD40665|CCEDDC333614|         1457|
|  KPLUS|HNH572635|B068E6A1C5F6|         2318|
|  KPLUS|HND141717|08674EE8D2C2|         1452|
|  KPLUS|HNH743103|402343C25D7D|          251|
|  KPLUS|HNH893773|B84DEE76D3B8|          924|
|  KPLUS|HND083642|B84DEE849A0F|         1444|
|  KPLUS|DNFD74404|90324BB44C39|          691|
|  KPLUS|DTFD21200|B84DEED27709|         1436|
|  KPLUS|LDFD05747|0C96E6C95E53|         1434|
+-------+---------+------------+-------------+
only showing top 10 rows



In [19]:
# Tag every row with the day covered by this file (20220401.json).
# lit() of a Python str produces a string column, so Date is not a DateType here.
df = df.withColumn("Date", lit("2022-04-01"))

In [20]:
df.show(10)

+-------+---------+------------+-------------+----------+
|AppName| Contract|         Mac|TotalDuration|      Date|
+-------+---------+------------+-------------+----------+
|  KPLUS|HNH579912|0C96E62FC55C|          254|2022-04-01|
|  KPLUS|HUFD40665|CCEDDC333614|         1457|2022-04-01|
|  KPLUS|HNH572635|B068E6A1C5F6|         2318|2022-04-01|
|  KPLUS|HND141717|08674EE8D2C2|         1452|2022-04-01|
|  KPLUS|HNH743103|402343C25D7D|          251|2022-04-01|
|  KPLUS|HNH893773|B84DEE76D3B8|          924|2022-04-01|
|  KPLUS|HND083642|B84DEE849A0F|         1444|2022-04-01|
|  KPLUS|DNFD74404|90324BB44C39|          691|2022-04-01|
|  KPLUS|DTFD21200|B84DEED27709|         1436|2022-04-01|
|  KPLUS|LDFD05747|0C96E6C95E53|         1434|2022-04-01|
+-------+---------+------------+-------------+----------+
only showing top 10 rows



In [21]:
df.select("Mac").distinct().count()

1498169

In [22]:
df.select("AppName").distinct().show()

+-------+
|AppName|
+-------+
|  KPLUS|
|  RELAX|
|  CHILD|
|CHANNEL|
|    VOD|
|   FIMS|
|  SPORT|
+-------+



In [23]:
df.select("Contract").distinct().count()

1360622

In [24]:
# Raw AppName code -> content category it is reported under.
stateDic = {'CHANNEL':'Truyền Hình','DSHD':'Truyền Hình', 'KPLUS':'Truyền Hình','VOD' : 'Phim truyen', 'FIMS': 'Phim truyen','SPORT': 'The thao', 'RELAX': 'Giai tri','CHILD': 'Thieu nhi'}

# Look the category up through a native map column instead of a Python lambda
# over the RDD. The rows never leave the DataFrame API, and an AppName that is
# missing from stateDic normally yields a null Type instead of aborting the
# whole job with a KeyError.
category_lookup = create_map(*[lit(item) for pair in stateDic.items() for item in pair])

# Same column names and order as before: Contract, Type, Mac, TotalDuration, Date.
df = df.select("Contract", category_lookup[col("AppName")].alias("Type"), "Mac", "TotalDuration", "Date")

In [25]:
df.show()

+---------+-----------+------------+-------------+----------+
| Contract|       Type|         Mac|TotalDuration|      Date|
+---------+-----------+------------+-------------+----------+
|HNH579912|Truyền Hình|0C96E62FC55C|          254|2022-04-01|
|HUFD40665|Truyền Hình|CCEDDC333614|         1457|2022-04-01|
|HNH572635|Truyền Hình|B068E6A1C5F6|         2318|2022-04-01|
|HND141717|Truyền Hình|08674EE8D2C2|         1452|2022-04-01|
|HNH743103|Truyền Hình|402343C25D7D|          251|2022-04-01|
|HNH893773|Truyền Hình|B84DEE76D3B8|          924|2022-04-01|
|HND083642|Truyền Hình|B84DEE849A0F|         1444|2022-04-01|
|DNFD74404|Truyền Hình|90324BB44C39|          691|2022-04-01|
|DTFD21200|Truyền Hình|B84DEED27709|         1436|2022-04-01|
|LDFD05747|Truyền Hình|0C96E6C95E53|         1434|2022-04-01|
|HNH063566|Truyền Hình|B84DEEDD1C85|          687|2022-04-01|
|HNH866786|Truyền Hình|10394E2790A5|          248|2022-04-01|
|NBAAA1128|Truyền Hình|10394E47C1AF|          247|2022-04-01|
|HNH9604

In [26]:
# Mac is not needed for the per-contract totals; the remaining columns keep
# their order (Contract, Type, TotalDuration, Date).
df = df.drop("Mac")

In [27]:
df.groupBy("Contract", "Type", "Date").agg(sum("TotalDuration").alias("TotalDuration")).show()

+---------+-----------+----------+-------------+
| Contract|       Type|      Date|TotalDuration|
+---------+-----------+----------+-------------+
|HNJ141458|Truyền Hình|2022-04-01|        47349|
|SGJ147370|Truyền Hình|2022-04-01|        83331|
|HNFD80728|Truyền Hình|2022-04-01|        91636|
|SGH574803|Truyền Hình|2022-04-01|       113492|
|NTFD92901|Truyền Hình|2022-04-01|        18944|
|LDD004076|Phim truyen|2022-04-01|        35454|
|HNJ051819|Phim truyen|2022-04-01|         6102|
|AGFD54302|Truyền Hình|2022-04-01|        11860|
|SGH919014|Truyền Hình|2022-04-01|        55429|
|NTFD63152|Truyền Hình|2022-04-01|        18886|
|HNH025464|Phim truyen|2022-04-01|        11888|
|SGH209300|Truyền Hình|2022-04-01|        25527|
|NND000787|Truyền Hình|2022-04-01|          175|
|VTFD15187|Phim truyen|2022-04-01|        35625|
|CBFD05588|Truyền Hình|2022-04-01|        62005|
|HTFD01984|Truyền Hình|2022-04-01|        81976|
|GLD014406|Truyền Hình|2022-04-01|        81937|
|DNAAA1260|Truyền Hì

In [28]:
# Total viewing time per (Contract, Type, Date); the aggregate is renamed back
# to TotalDuration so the column name is unchanged.
duration_totals = df.groupBy("Contract", "Type", "Date").sum("TotalDuration")
df = duration_totals.withColumnRenamed("sum(TotalDuration)", "TotalDuration")

In [29]:
df.count()

1483421

<b>Program</b>

In [30]:
import findspark

findspark.init()

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import pyodbc

In [32]:
spark = SparkSession.builder.getOrCreate()

In [33]:
def process_log_data(path, file_name):
    """Load one daily log file and total the viewing time per contract and category.

    Args:
        path: Directory prefix of the log files. It is concatenated directly
            with ``file_name``, so it must end with a path separator.
        file_name: Name of the JSON log file to read.

    Returns:
        A DataFrame with one row per ``Contract`` and one column per content
        category, holding the summed ``TotalDuration`` (null when the contract
        has no rows in that category). The category columns are always the
        same five, in the same order, regardless of which categories occur in
        the file.
    """
    # (category label, raw AppName codes reported under it). The order of this
    # list is the order of the pivoted columns in the result.
    app_categories = [
        ("Giải Trí", ["RELAX"]),
        ("Phim Truyện", ["VOD", "FIMS_RES", "BHD_RES", "VOD_RES", "FIMS", "BHD", "DANET"]),
        ("Thiếu Nhi", ["CHILD"]),
        ("Thể Thao", ["SPORT"]),
        ("Truyền Hình", ["CHANNEL", "DSHD", "KPLUS", "KPlus"]),
    ]
    category_names = [category for category, _ in app_categories]

    # Build one when(...).when(...) chain; anything unmatched (including a null
    # AppName) falls through to the "Error" sentinel and is filtered out below.
    type_col = None
    for category, app_names in app_categories:
        matches = col("AppName").isin(app_names)
        type_col = when(matches, category) if type_col is None else type_col.when(matches, category)
    type_col = type_col.otherwise("Error")

    df = spark.read.json(path + file_name)

    # Flatten the fields that are needed out of the nested `_source` struct.
    df = df.select("_source.AppName", "_source.Contract", "_source.TotalDuration")

    df = df.withColumn("Type", type_col)
    df = df.filter(col("Type") != "Error")

    # Passing the pivot values explicitly pins the output schema. Without them
    # Spark derives the columns from the categories present in this file, so a
    # day lacking one category would return fewer columns and could not be
    # unioned reliably with the other days. The pivot's own sum already totals
    # per (Contract, Type), so no separate pre-aggregation is needed.
    df = df.groupBy("Contract").pivot("Type", values=category_names).sum("TotalDuration")

    return df

In [36]:
def main_task(path="D:\\Dataset\\log_content\\", output_path='./DF_clean'):
    """Summarise the April 2022 logs per contract and save the result as CSV.

    Each daily file (20220401.json .. 20220430.json) is reduced with
    ``process_log_data``, the daily results are stacked, and the category
    columns are summed per ``Contract``.

    Args:
        path: Directory prefix of the daily log files; must end with a path
            separator because file names are appended by concatenation.
        output_path: Directory the CSV result is written to. Existing output
            at this location is replaced.

    Returns:
        The aggregated DataFrame: ``Contract`` plus one ``sum(<category>)``
        column per content category.
    """
    # Zero-padded day numbers give 20220401.json .. 20220430.json.
    file_names = ["202204{:02d}.json".format(day) for day in range(1, 31)]

    df = None
    for file_name in file_names:
        df_day = process_log_data(path, file_name)
        # Match columns by name rather than by position, so a day whose
        # columns differ fails loudly instead of adding one category's
        # durations to another's.
        df = df_day if df is None else df.unionByName(df_day)

    df = df.groupBy("Contract").sum()

    # Save data. mode="overwrite" lets the notebook be re-run: the default
    # mode raises an error when the output directory already exists.
    df.repartition(1).write.csv(output_path, header=True, mode="overwrite")
    return df

In [37]:
# Build the month-wide per-contract summary; main_task() also writes it to ./DF_clean.
df = main_task()
# Optional: uncomment to persist the result so later actions do not recompute it.
# df = df.cache()

In [38]:
df.na.fill(0).show()

+---------+-------------+----------------+--------------+-------------+----------------+
| Contract|sum(Giải Trí)|sum(Phim Truyện)|sum(Thiếu Nhi)|sum(Thể Thao)|sum(Truyền Hình)|
+---------+-------------+----------------+--------------+-------------+----------------+
|AGAAA0848|            0|               0|             0|            0|           12141|
|AGAAA2588|            0|               0|             0|            0|         1078595|
|AGD003807|            0|          153369|           164|            0|            4352|
|AGD004253|            0|               0|             0|            0|          264972|
|AGD008179|        13225|          135699|             0|            0|          115222|
|AGD011212|            0|            3799|             0|            0|          308980|
|AGD022636|           35|               0|             0|            0|           26553|
|AGD026510|            0|               0|             0|            0|          676380|
|AGD029035|          

In [8]:
# Reload the saved summary. inferSchema=True restores the numeric type of the
# duration columns; without it every CSV column is read back as a string.
df = spark.read.csv("./DF_clean/*.csv", header=True, inferSchema=True)

In [9]:
df.show()

+---------+-------------+----------------+--------------+-------------+----------------+
| Contract|sum(Giải Trí)|sum(Phim Truyện)|sum(Thiếu Nhi)|sum(Thể Thao)|sum(Truyền Hình)|
+---------+-------------+----------------+--------------+-------------+----------------+
|AGAAA0848|         null|            null|          null|         null|           12141|
|AGAAA2588|         null|            null|          null|         null|         1078595|
|AGD003807|         null|          153369|           164|         null|            4352|
|AGD004253|         null|            null|          null|         null|          264972|
|AGD008179|        13225|          135699|          null|         null|          115222|
|AGD011212|         null|            3799|          null|         null|          308980|
|AGD022636|           35|            null|          null|         null|           26553|
|AGD026510|         null|            null|          null|         null|          676380|
|AGD029035|         n