# Meta Kaggle Overview
Kaggle's public data on competitions, users, submission scores, and kernels

- https://www.kaggle.com/datasets/kaggle/meta-kaggle


In [1]:
%%html
<style type='text/css'>
.CodeMirror {
    font-size: 14px; 
    font-family: 'Droid Sans Mono';
}
</style>

In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [3]:
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import *

In [6]:
spark = (SparkSession
        .builder
        .appName("meta-kaggle-data-analysis")
        .getOrCreate()
        )

24/03/08 19:46:06 WARN Utils: Your hostname, blue resolves to a loopback address: 127.0.1.1; using 192.168.0.41 instead (on interface wlo1)
24/03/08 19:46:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/08 19:46:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
spark

In [8]:
data_files_path = "/home/rk/Desktop/data/kaggle-meta"

In [9]:
!ls -lhS $data_files_path

total 28G
-rw-rw-r-- 1 rk rk  13G Mar  5 02:12 EpisodeAgents.csv
-rw-rw-r-- 1 rk rk 4.8G Mar  5 02:38 UserAchievements.csv
-rw-rw-r-- 1 rk rk 3.5G Mar  5 02:30 Episodes.csv
-rw-rw-r-- 1 rk rk 1.7G Mar  5 02:36 Submissions.csv
-rw-rw-r-- 1 rk rk 1.5G Mar  5 02:34 KernelVersions.csv
-rw-rw-r-- 1 rk rk 953M Mar  5 02:41 Users.csv
-rw-rw-r-- 1 rk rk 869M Mar  5 02:09 DatasetVersions.csv
-rw-rw-r-- 1 rk rk 746M Mar  5 02:32 ForumMessages.csv
-rw-rw-r-- 1 rk rk 549M Mar  5 02:38 Teams.csv
-rw-rw-r-- 1 rk rk 303M Mar  5 02:38 TeamMemberships.csv
-rw-rw-r-- 1 rk rk 236M Mar  5 02:34 KernelVersionDatasetSources.csv
-rw-rw-r-- 1 rk rk 188M Mar  5 02:36 KernelVotes.csv
-rw-rw-r-- 1 rk rk 165M Mar  5 02:36 Kernels.csv
-rw-rw-r-- 1 rk rk 149M Mar  5 02:32 ForumMessageVotes.csv
-rw-rw-r-- 1 rk rk  75M Mar  5 02:11 DatasetVotes.csv
-rw-rw-r-- 1 rk rk  71M Mar  5 02:33 KernelVersionCompetitionSources.csv
-rw-rw-r-- 1 rk rk  58M Mar  5 02:41 UserFollowers.csv
-rw-rw-r-- 1 rk rk  52M M

- `EpisodeAgents.csv` is the biggest file (~3.5GB)
- There are 4 more files that are bigger.

****

## Gather column names of all datasets

In [10]:
import os
import csv

In [11]:
schemas = dict()

# Iterate through CSV files in the directory
for filename in os.listdir(data_files_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(data_files_path, filename)
        
        with open(file_path, 'r') as file:
            reader = csv.reader(file)
            headers = next(reader)  # Get the header row
            
            schemas[filename] = headers
            
print("All Datasets & Column names")

for filename, schema in schemas.items():
    print(filename)
    print(schema)
    print("")

All Datasets & Column names
CompetitionTags.csv
['Id', 'CompetitionId', 'TagId']

Datasets.csv
['Id', 'CreatorUserId', 'OwnerUserId', 'OwnerOrganizationId', 'CurrentDatasetVersionId', 'CurrentDatasourceVersionId', 'ForumId', 'Type', 'CreationDate', 'LastActivityDate', 'TotalViews', 'TotalDownloads', 'TotalVotes', 'TotalKernels']

ForumMessageVotes.csv
['Id', 'ForumMessageId', 'FromUserId', 'ToUserId', 'VoteDate']

EpisodeAgents.csv
['Id', 'EpisodeId', 'Index', 'Reward', 'State', 'SubmissionId', 'InitialConfidence', 'InitialScore', 'UpdatedConfidence', 'UpdatedScore']

KernelVersionDatasetSources.csv
['Id', 'KernelVersionId', 'SourceDatasetVersionId']

Episodes.csv
['Id', 'Type', 'CompetitionId', 'CreateTime', 'EndTime']

KernelLanguages.csv
['Id', 'Name', 'DisplayName', 'IsNotebook']

Tags.csv
['Id', 'ParentTagId', 'Name', 'Slug', 'FullPath', 'Description', 'DatasetCount', 'CompetitionCount', 'KernelCount']

Competitions.csv
['Id', 'Slug', 'Title', 'Subtitle', 'HostSegmentTitle', 'Foru

24/03/08 19:46:24 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


****

In [12]:
spark.stop()