# Data Types

##  Structured data



### Tabular data

Data stored in databases or tables with well-defined rows and columns, such as relational (SQL) databases.



In [None]:
# Install necessary libraries
!pip install pandas eurostat pyjstat

In [None]:
import pandas as pd
import eurostat
import pyjstat

# Download a dataset from Eurostat by its dataset code.
# NOTE(review): this was originally described as "GDP per capita for EU
# countries", but 'nama_10_gdp' looks like the general national-accounts GDP
# table rather than a per-capita series — confirm the dataset code.
data_eurostat = eurostat.get_data_df('nama_10_gdp')
# Preview the first rows (shown as the cell output in a notebook)
data_eurostat.head()



### Time series data

Measurements stored in a structured form in which each value is tied to a point in time (e.g. stock prices, changes in sensor data over time).

In [None]:
import yfinance as yf

# Download NVIDIA (ticker 'NVDA') stock data for the requested date range.
# NOTE(review): yfinance documents `end` as exclusive, so the last row is
# presumably 2023-10-25 — confirm.
nvidia = yf.download('NVDA', start='2023-01-01', end='2023-10-26')

# Show the downloaded data as the cell output
nvidia

In [None]:
# Print a structural summary of the downloaded data
# (assumes `nvidia` is a pandas DataFrame — TODO confirm)
nvidia.info()

In [None]:
# Show descriptive statistics of the downloaded data
# (assumes `nvidia` is a pandas DataFrame — TODO confirm)
nvidia.describe()

In [None]:
# Write the data to "nvidia.csv" (a relative path, so it lands in the current working directory)
nvidia.to_csv("nvidia.csv")

### Network data (Graph data)

Networks of connections, such as social media networks or route networks. Although such data can in some cases be considered semi-structured, it often has a well-defined structure.

In [None]:
!pip install networkx matplotlib

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Load networkx's built-in karate club example graph
graph = nx.karate_club_graph()

# Styling for the drawing, gathered in one place so it is easy to tweak
draw_options = {
    "with_labels": True,
    "node_color": 'skyblue',
    "node_size": 500,
    "edge_color": 'gray',
}

# Render the graph on an 8x6 inch figure and display it
plt.figure(figsize=(8, 6))
nx.draw(graph, **draw_options)
plt.title("Karate Club Graph")
plt.show()

## Semi-structured data



### Document-based data

Data in formats such as XML or JSON, which is structured to a certain degree but not in a rigid tabular format.

In [None]:
import json

# A small JSON document kept inline as a string
json_data = """
{
  "name": "John Doe",
  "age": 30,
  "city": "New York"
}
"""

# Deserialize the JSON text into a regular Python dict
data = json.loads(json_data)

# Look up individual fields by key (prints "John Doe", then 30)
for field in ("name", "age"):
    print(data[field])

# Dicts are mutable, so a field can be changed in place
data.update(age=31)

# Serialize the dict back into a JSON string
new_json_data = json.dumps(data)

# Show the updated JSON text
print(new_json_data)

In [None]:
# Working with XML

import xml.etree.ElementTree as ET

# A small XML document kept inline as a string
xml_data = """
<person>
  <name>Jane Doe</name>
  <age>25</age>
  <city>London</city>
</person>
"""

# Build the element tree; `root` is the top-level <person> element
root = ET.fromstring(xml_data)

# Read the text of the <name> and <age> children (element text is always a string)
name, age = (root.find(tag).text for tag in ("name", "age"))
print(name)  # Output: Jane Doe
print(age)   # Output: 25

### Spatial (geospatial) data

GPS coordinates, map data. These are usually stored in a semi-structured format (e.g. GeoJSON, KML).

In [None]:
!pip install geopandas

In [None]:
# https://www.naturalearthdata.com/

import pandas as pd


import geopandas as gpd
import requests

# Download GeoJSON data from a public source (e.g., Natural Earth)
url = "https://d2ad6b4ur7yvpq.cloudfront.net/naturalearth-3.3.0/ne_110m_admin_0_countries.geojson"
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of a confusing JSON decode error
response.raise_for_status()
geojson_data = response.json()

# Load the GeoJSON features into a GeoDataFrame.
# from_features() does not set a coordinate reference system on its own;
# GeoJSON coordinates are WGS 84 longitude/latitude, so declare that explicitly
# to keep later reprojection and distance/area operations working.
gdf = gpd.GeoDataFrame.from_features(geojson_data['features'], crs="EPSG:4326")

# Print the first few rows of the GeoDataFrame
gdf.head()

In [None]:
# Plot the GeoDataFrame with the default styling (no arguments are passed)
gdf.plot()


### Sensor data

Although structured to some extent, sensor data often arrives in semi-structured form, especially when different types of sensors send different data or when different formats (e.g. CSV or JSON) are used.

In [None]:
# prompt: Generate me a simple code which show how can I manage the sensor data in python

import matplotlib.pyplot as plt
import pandas as pd

# Sample sensor data (replace with your actual data)
sensor_data = [
    {"timestamp": "2023-10-27 10:00:00", "temperature": 25.5, "humidity": 60.2},
    {"timestamp": "2023-10-27 10:01:00", "temperature": 25.7, "humidity": 60.5},
    {"timestamp": "2023-10-27 10:02:00", "temperature": 25.9, "humidity": 60.8},
]

# Convert the list of dictionaries to a Pandas DataFrame for easier manipulation
df = pd.DataFrame(sensor_data)

# Parse the timestamp strings into real datetimes. Left as plain strings they
# are treated as category labels: the plot's x-axis would be evenly spaced
# regardless of the actual time gaps, and time-based operations would not work.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Print the DataFrame
print(df)

# You can now perform various operations on the DataFrame, such as:

# 1. Calculate the average temperature:
average_temperature = df['temperature'].mean()
print(f"Average temperature: {average_temperature}")

# 2. Filter data based on specific conditions:
high_humidity_data = df[df['humidity'] > 60.5]
print("Data with high humidity:")
print(high_humidity_data)

# 3. Plot the temperature over time:
plt.plot(df['timestamp'], df['temperature'])
plt.xlabel('Timestamp')
plt.ylabel('Temperature')
plt.title('Temperature over Time')
plt.show()

# 4. Export the data to a CSV file:
df.to_csv('sensor_data.csv', index=False)



## Unstructured data

- Audio data: Voice recordings, music, sound effects, which usually do not contain a structure (e.g. MP3, WAV).
- Video data: Movies, moving image content (e.g. MP4, AVI), where the structure is minimal.

### Image data

Photos, images (e.g. JPG, PNG), which are usually stored in an unstructured form.

In [None]:
# prompt: Generate me a simple code wich show me how can i work with image in python

from PIL import Image
import requests
from io import BytesIO

# Download an image from a URL
image_url = "https://www.easygifanimator.net/images/samples/video-to-gif-sample.gif"  # Replace with your image URL
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(image_url, timeout=30)
# Fail with a clear HTTP error instead of handing an error page to PIL
response.raise_for_status()
img = Image.open(BytesIO(response.content))

# Display the image
img.show()

# Resize the image
new_width = 200
new_height = 150
resized_img = img.resize((new_width, new_height))
resized_img.show()

# Convert to grayscale
grayscale_img = img.convert("L")

# Save the image
# JPEG cannot store palette ("P") or alpha images, which is what a GIF source
# decodes to, so saving directly raises "cannot write mode P as JPEG".
# Converting to RGB first makes the save work for any input format.
resized_img.convert("RGB").save("resized_image.jpg")  # Save as JPG, PNG, etc.

### Video data

Movies, moving image content (e.g. MP4, AVI), where the structure is minimal.

In [None]:
!pip install -U yt-dlp

In [None]:
# Save under a fixed name so the following cells can open "Never Gonna Give You Up.mp4";
# without -o, yt-dlp names the file after the full video title and id instead.
!yt-dlp -f "bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" -o "Never Gonna Give You Up.%(ext)s" "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

In [None]:
!pip install moviepy

In [None]:
from moviepy.editor import *

def extract_audio(video_file, audio_file):
    """
    Extract the audio track of a video file and write it to its own file.

    This is a best-effort helper for interactive use: any failure is reported
    with a printed message instead of being raised to the caller.

    Args:
      video_file: Path to the input MP4 video file.
      audio_file: Path to save the extracted audio file (e.g., 'output.mp3').

    Returns:
      None. The outcome is reported via print.
    """
    try:
        # The context manager closes the clip (and the reader it holds open)
        # even when writing the audio fails part-way through.
        with VideoFileClip(video_file) as video:
            audio = video.audio
            if audio is None:
                # A clip without an audio track would otherwise fail with an
                # opaque "'NoneType' object has no attribute ..." message.
                print(f"Error extracting audio: no audio track found in {video_file}")
                return
            audio.write_audiofile(audio_file)
    except Exception as e:
        print(f"Error extracting audio: {e}")
    else:
        print(f"Audio extracted successfully to {audio_file}")


# Example usage:
# NOTE(review): both paths are relative to the current working directory, and
# the input name must match the file actually present on disk — confirm before running.
video_path = "Never Gonna Give You Up.mp4"  # Replace with the path to your MP4 video file
audio_path = "Never Gonna Give You Up.mp3"  # Replace with the desired output audio file path

# Run the extraction; extract_audio reports success or failure via print
extract_audio(video_path, audio_path)

In [None]:
import os

import cv2
from tqdm import tqdm

# Save one frame out of every `resolution` frames read
resolution = 24
output_folder = "Images"
# cv2.imwrite does not create missing directories; it just returns False and
# writes nothing, so make sure the target folder exists first.
os.makedirs(output_folder, exist_ok=True)

vidcap = cv2.VideoCapture("Never Gonna Give You Up.mp4")
success, image = vidcap.read()

count = 0
# Look at no more than the first 240 frames of the video
for _ in tqdm(range(0, 240)):
    if not success:
        # The video could not be opened or has fewer frames than requested;
        # `image` is None here and must not be passed to cv2.imwrite.
        break
    if count % resolution == 0:
        cv2.imwrite(os.path.join(output_folder, "frame%d.jpg" % count), image)
    success, image = vidcap.read()
    count += 1

# Release the underlying video file handle
vidcap.release()

### Text data

Content of articles, e-mails, chat messages, web pages. These are often completely unstructured and do not have a well-defined format.

In [None]:
!pip install datasets

In [None]:
# https://huggingface.co/

from datasets import load_dataset

# Load the "sileod/movie_recommendation" dataset by its identifier
ds = load_dataset("sileod/movie_recommendation")
# Show the loaded object as the cell output
ds

In [None]:
import pandas as pd

# Convert the 'test' split of the dataset to a Pandas DataFrame
df = pd.DataFrame(ds['test'])

# Now 'df' is a Pandas DataFrame containing the data from the 'test' split of your dataset
df.head()

In [None]:
!pip install arxiv pymupdf langchain_openai langchain_experimental langchain-community

In [None]:
# ArxivLoader is provided by the langchain-community package (installed above);
# the old `langchain.document_loaders` import path is deprecated.
from langchain_community.document_loaders import ArxivLoader

# Search arXiv and load at most 10 matching papers as documents
documents = ArxivLoader(
    query="Business Stock prices Machine Learning Deep Learning",
    load_max_docs=10,
).load()

for doc in documents:
    # The metadata keys used here are capitalized: 'Title' and 'Summary'
    print(f"Title: {doc.metadata['Title']}")
    print(f"Summary: {doc.metadata['Summary']}")
    print("-" * 20)