<a href="https://colab.research.google.com/github/plus2net/Python-basics/blob/main/kaggle_dataset_1_downloading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle

Upload the `kaggle.json` file, then run the code below to place it in the `~/.kaggle` directory.

In [None]:
from google.colab import files

# files.upload() returns a dict that maps each uploaded file name to its raw
# bytes. Assign it to a variable instead of leaving the call as the last bare
# expression: otherwise the dict -- including the API key stored inside
# kaggle.json -- is echoed as the cell output and saved with the notebook.
uploaded = files.upload()  # Choose kaggle.json when prompted

# Report only the file names, never the file contents.
print("Uploaded:", ", ".join(uploaded))

In [2]:
# Create the directory if it doesn't exist (-p makes this a no-op when it already does)
!mkdir -p ~/.kaggle

# Move kaggle.json to the correct directory. Assumes kaggle.json is in the current working directory.
# If your kaggle.json is in a different location, please update the path below.
# Note: once the file has been moved, re-running this cell without uploading it
# again makes mv report that kaggle.json does not exist.
!mv kaggle.json ~/.kaggle/

# Set permissions for the kaggle.json file (600 = read/write for the owner only)
!chmod 600 ~/.kaggle/kaggle.json

With the `kaggle.json` in place, let's try downloading the dataset.

In [3]:
# Download the Titanic dataset, identified by its <owner>/<dataset> slug.
# In the recorded run below, the archive was saved as titanic-dataset.zip in /content.
!kaggle datasets download -d yasserh/titanic-dataset

Dataset URL: https://www.kaggle.com/datasets/yasserh/titanic-dataset
License(s): CC0-1.0
Downloading titanic-dataset.zip to /content
  0% 0.00/22.0k [00:00<?, ?B/s]
100% 22.0k/22.0k [00:00<00:00, 77.4MB/s]


Now that the dataset is downloaded, we can unzip it and then remove the downloaded `.zip` file and any other metadata files that might have been downloaded alongside it to keep the environment clean.

In [4]:
# Unzip the downloaded dataset
!unzip -q titanic-dataset.zip

In [5]:
# Remove the zip file after extraction
!rm titanic-dataset.zip

# Remove other metadata files if they exist and are not needed
!rm -f titanic-dataset.zip.json

In [6]:
import pandas as pd

# List the extracted files to find the CSV file name
!ls

# Load the CSV file into a pandas DataFrame using the correct file name
df = pd.read_csv('Titanic-Dataset.csv')

# Display the first 5 rows of the DataFrame
display(df.head())

sample_data  Titanic-Dataset.csv


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


What is the total number of rows and columns?

In [7]:
# df.shape is a (rows, columns) tuple; pick out each dimension by position
num_rows = df.shape[0]
num_cols = df.shape[1]

# Report both counts, one per line
print(
    f"Total number of rows: {num_rows}",
    f"Total number of columns: {num_cols}",
    sep="\n",
)

Total number of rows: 891
Total number of columns: 12


Part II: Create an SQLite database.

In [8]:
import sqlite3
from contextlib import closing

import pandas as pd

# Name of the CSV file to convert
csv_file = 'Titanic-Dataset.csv'
# Name of the SQLite database file to create
db_file = 'titanic.db'
# Name of the table within the SQLite database
table_name = 'titanic_data'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file)

# closing() guarantees conn.close() runs even if to_sql raises, so a failed
# write cannot leave the connection open for the rest of the session.
# (Using the sqlite3 connection itself as a context manager only manages the
# transaction; it does not close the connection.)
with closing(sqlite3.connect(db_file)) as conn:
    # if_exists='replace' recreates the table, so re-running this cell does
    # not append duplicate rows; index=False leaves the DataFrame index out.
    df.to_sql(table_name, conn, if_exists='replace', index=False)

print(f"Successfully converted '{csv_file}' to SQLite database '{db_file}' with table '{table_name}'.")

Successfully converted 'Titanic-Dataset.csv' to SQLite database 'titanic.db' with table 'titanic_data'.


To verify the conversion, let's connect to the new SQLite database and display the first few rows of the table:

In [9]:
import sqlite3
from contextlib import closing

import pandas as pd

# Open the database only for the duration of the query. closing() makes sure
# the connection is closed even if the query raises (for example when the
# table has not been created yet).
with closing(sqlite3.connect('titanic.db')) as conn:
    # Read the first 5 rows of the table into a pandas DataFrame
    db_df = pd.read_sql_query("SELECT * FROM titanic_data LIMIT 5;", conn)

# Display the DataFrame
display(db_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
import pandas as pd

# Input CSV and the JSON file that will be produced from it
csv_file = 'Titanic-Dataset.csv'
json_file = 'titanic.json'

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Write one JSON object per row (orient='records'), pretty-printed with a
# 4-space indent
df.to_json(path_or_buf=json_file, orient='records', indent=4)

print(f"Successfully converted '{csv_file}' to JSON file '{json_file}'.")

Successfully converted 'Titanic-Dataset.csv' to JSON file 'titanic.json'.


To verify the conversion, let's display the first few lines of the generated JSON file:

In [11]:
import json

# Print the opening lines of the JSON file, stopping after the 15th for brevity
with open('titanic.json', 'r') as f:
    for i, line in enumerate(f):
        if i >= 15:
            break
        # strip() drops the indentation and the trailing newline
        print(line.strip())

[
{
"PassengerId":1,
"Survived":0,
"Pclass":3,
"Name":"Braund, Mr. Owen Harris",
"Sex":"male",
"Age":22.0,
"SibSp":1,
"Parch":0,
"Ticket":"A\/5 21171",
"Fare":7.25,
"Cabin":null,
"Embarked":"S"
},


In [12]:
import xml.etree.ElementTree as ET
from xml.dom import minidom

import pandas as pd

# Source CSV and destination XML file
csv_file = 'Titanic-Dataset.csv'
xml_file = 'titanic.xml'

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# <TitanicData> is the document root; every DataFrame row becomes one <Record>
root = ET.Element('TitanicData')

for index, row in df.iterrows():
    record = ET.SubElement(root, 'Record')
    # One child element per column, using the column name as the tag
    for col_name, value in row.items():
        child = ET.SubElement(record, col_name)
        # Missing values (NaN) are written as empty text
        if pd.notna(value):
            child.text = str(value)
        else:
            child.text = ''

# ElementTree wrapper around the root (the file below is written from the
# pretty-printed string, not from this object)
tree = ET.ElementTree(root)

# Serialize the element tree, then re-parse it with minidom to obtain an
# indented, human-readable string
raw_xml = ET.tostring(root)
xml_string = minidom.parseString(raw_xml).toprettyxml(indent="    ")

with open(xml_file, "w", encoding="utf-8") as f:
    f.write(xml_string)

print(f"Successfully converted '{csv_file}' to XML file '{xml_file}'.")

Successfully converted 'Titanic-Dataset.csv' to XML file 'titanic.xml'.


To verify the conversion, let's display the first few lines of the generated XML file:

In [None]:
import os

# Read the first few lines of the XML file
# We'll read more lines than JSON due to XML's verbose structure
num_lines_to_display = 30

if os.path.exists('titanic.xml'):
    # The conversion cell writes this file with encoding="utf-8"; read it back
    # with the same encoding instead of the platform default, which may differ
    # (and could fail or garble non-ASCII characters).
    with open('titanic.xml', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < num_lines_to_display:
                # strip() drops the indentation and the trailing newline
                print(line.strip())
            else:
                break
else:
    print("Error: 'titanic.xml' not found.")