# Meta Kaggle Overview - Kernels
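Meta Kaggle is Kaggle's public data on competitions, users, submission scores, and kernels. This notebook loads the Kernels and Users tables, joins each kernel to its author, and exports the combined result as a CSV file.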

- https://www.kaggle.com/datasets/kaggle/meta-kaggle


In [1]:
%%html
<style type='text/css'>
/* Font used for elements carrying the CodeMirror class. The generic
   `monospace` fallback keeps code fixed-width when Droid Sans Mono is
   not installed on the viewer's machine. */
.CodeMirror {
    font-size: 14px;
    font-family: 'Droid Sans Mono', monospace;
}
</style>

In [2]:
from IPython.display import display, HTML
# Inject a CSS rule that stretches elements with the `container` class to 85%
# of the available width.
container_css = "<style>.container { width:85% !important; }</style>"
display(HTML(container_css))

In [3]:
import pandas as pd

# pandas rendering options for this notebook.
# Rows and columns are unlimited (None). The earlier limits of 500 were
# overridden by a later call in the same cell, so only the effective values
# are kept here.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Allow wide tables and long cell values (e.g. URLs) without truncation.
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 1000)

In [4]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType
from pyspark.sql.functions import *

In [5]:
# Session settings for this analysis, applied to the builder in the order
# listed.
spark_settings = {
    "spark.driver.memory": "4g",
    "spark.executor.instances": 4,
    "spark.executor.memory": "1g",
    "spark.executor.cores": 2,
}

session_builder = SparkSession.builder.appName("meta-kaggle-data-analysis")
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)

# getOrCreate() hands back an existing session when there is one, otherwise it
# starts a new one with the settings above.
spark = session_builder.getOrCreate()

24/03/08 20:20:15 WARN Utils: Your hostname, blue resolves to a loopback address: 127.0.1.1; using 192.168.0.41 instead (on interface wlo1)
24/03/08 20:20:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/08 20:20:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Show the SparkSession object as this cell's output.
spark

****

In [7]:
# Root directory holding the Meta Kaggle CSV files.
# Set the META_KAGGLE_DATA_DIR environment variable to point somewhere else;
# when it is unset the original location is used, so existing runs behave the
# same as before.
data_files_path = os.environ.get(
    "META_KAGGLE_DATA_DIR", "/home/rk/Desktop/data/kaggle-meta"
)

## Kernels

In [8]:
# Full path of the Kernels table inside the data directory.
kernels_file = "{}/Kernels.csv".format(data_files_path)

In [9]:
# Read Kernels.csv using its first row as column names. No schema is supplied
# or inferred, so every column comes back as a string (see the printed schema).
kernels = spark.read.option("header", True).csv(kernels_file)

# Register the DataFrame so later cells can query it with Spark SQL.
kernels.createOrReplaceTempView("kernels")

kernels.printSchema()

root
 |-- Id: string (nullable = true)
 |-- AuthorUserId: string (nullable = true)
 |-- CurrentKernelVersionId: string (nullable = true)
 |-- ForkParentKernelVersionId: string (nullable = true)
 |-- ForumTopicId: string (nullable = true)
 |-- FirstKernelVersionId: string (nullable = true)
 |-- CreationDate: string (nullable = true)
 |-- EvaluationDate: string (nullable = true)
 |-- MadePublicDate: string (nullable = true)
 |-- IsProjectLanguageTemplate: string (nullable = true)
 |-- CurrentUrlSlug: string (nullable = true)
 |-- Medal: string (nullable = true)
 |-- MedalAwardDate: string (nullable = true)
 |-- TotalViews: string (nullable = true)
 |-- TotalComments: string (nullable = true)
 |-- TotalVotes: string (nullable = true)



In [10]:
# Peek at the first two rows of the registered "kernels" view.
spark.table("kernels").limit(2).toPandas()

Unnamed: 0,Id,AuthorUserId,CurrentKernelVersionId,ForkParentKernelVersionId,ForumTopicId,FirstKernelVersionId,CreationDate,EvaluationDate,MadePublicDate,IsProjectLanguageTemplate,CurrentUrlSlug,Medal,MedalAwardDate,TotalViews,TotalComments,TotalVotes
0,1,2505,205,,,1,03/25/2015 18:25:32,03/23/2018,03/25/2015,False,hello,,,383,0,0
1,2,3716,1748,,26670.0,2,03/25/2015 18:31:07,04/16/2015,03/25/2015,False,rf-proximity,3.0,07/15/2016,9128,1,13


24/03/08 20:20:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Users

In [11]:
# Full path of the Users table inside the data directory.
users_file = "{}/Users.csv".format(data_files_path)

# Read Users.csv using its first row as column names; all columns are strings.
users = spark.read.option("header", True).csv(users_file)

# Register the DataFrame so later cells can query it with Spark SQL.
users.createOrReplaceTempView("users")

users.printSchema()

root
 |-- Id: string (nullable = true)
 |-- UserName: string (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- RegisterDate: string (nullable = true)
 |-- PerformanceTier: string (nullable = true)



## Combine Kernels with Users

In [12]:
# Join every kernel to its author, build the kernel's public URL, and parse the
# kernel's creation date into the EnabledDate column.
#
# The query is a raw Python string (r"""...""") so the regex backslashes reach
# Spark unchanged instead of being read as invalid Python escape sequences.
kernels_out = spark.sql(r"""
SELECT
    u.Id AS UserId
  , u.UserName
  , u.DisplayName
  , u.PerformanceTier
  , CONCAT("https://www.kaggle.com/code/", u.UserName, "/", k.CurrentUrlSlug) AS kernel_url
  , CASE
      -- Only values that START with MM/dd/yyyy are parsed, because exactly
      -- the first 10 characters are passed to to_date. A NULL CreationDate
      -- does not match and falls through to the ELSE branch.
      WHEN k.CreationDate RLIKE r"^\d{2}/\d{2}/\d{4}" THEN
          to_date(SUBSTR(k.CreationDate, 1, 10), "MM/dd/yyyy")
      ELSE
          -- Placeholder for missing or malformed dates, written as a DATE
          -- literal so both branches have the same type.
          DATE '2010-01-01'
    END AS EnabledDate
  , k.TotalViews
  , k.TotalComments
  , k.TotalVotes
  , k.Id AS kernelId
  , k.CurrentUrlSlug
FROM
    kernels k
    INNER JOIN users u
        ON k.AuthorUserId = u.Id
""")

# Preview a few joined rows.
kernels_out.limit(3).toPandas()

                                                                                

Unnamed: 0,UserId,UserName,DisplayName,PerformanceTier,kernel_url,EnabledDate,TotalViews,TotalComments,TotalVotes,kernelId,CurrentUrlSlug
0,10001008,amauraster,JOSE MANUEL LOZANO,0,https://www.kaggle.com/code/amauraster/ok-cupid-clustering-profiles-with-k-means,2023-05-04,543,0,2,38125501,ok-cupid-clustering-profiles-with-k-means
1,10001008,amauraster,JOSE MANUEL LOZANO,0,https://www.kaggle.com/code/amauraster/predicting-diamonds-prices-with-ml-random-forest,2023-06-06,47,0,1,39595991,predicting-diamonds-prices-with-ml-random-forest
2,1000123,actright,Timur Letyagin,0,https://www.kaggle.com/code/actright/notebooka8f98d78ff,2017-04-01,167,0,0,225475,notebooka8f98d78ff


In [13]:
# Destination for the joined kernels/users table.
out_file = f"{data_files_path}/formatted/Kernels_fmt.csv"

# to_csv does not create missing parent directories, so make sure the
# "formatted" folder exists before writing.
os.makedirs(os.path.dirname(out_file), exist_ok=True)

# Sort by author name, then write a single CSV without the pandas index.
# NOTE: toPandas() collects the full joined result on the driver.
kernels_out.orderBy("UserName").toPandas().to_csv(out_file, index=False)

                                                                                

In [14]:
# Stop the Spark session; this is the last step of the notebook.
spark.stop()