# Chapter 2: Python Fundamentals, Machine Learning, and AWS Integration

## Part 1: Python Fundamentals

### 1.1 Setting Up Python

1. Visit https://www.python.org/downloads/
2. Download and install the latest version of Python for your operating system
3. Verify the installation by opening a terminal/command prompt and typing (on some macOS/Linux systems the command is `python3` rather than `python`):
   ```
   python --version
   ```

### 1.2 Python Basics

#### Variables and Data Types

In [None]:
# Python is dynamically typed: each name takes the type of the value bound to it.
x, y = 5, 3.14        # an int and a float, assigned in one statement
name = "Alice"        # str
is_student = True     # bool

# An f-string substitutes each {expression} with its current value.
print(f"x: {x}, y: {y}, name: {name}, is_student: {is_student}")

#### Control Structures

##### If Statements

In [None]:
# Classify an age into one of three bands. Branches are tested top to bottom
# and only the first one whose condition is true runs.
age = 20
if age < 18:
    print("Minor")
elif age < 65:
    # Reaching this branch already means age >= 18, so only the upper bound
    # needs to be checked.
    print("Adult")
else:
    print("Senior")

##### Loops

###### For Loops

In [None]:
# A for loop visits the items of an iterable one at a time.

# 1) A list: items arrive in their stored order
fruits = ["apple", "banana", "cherry"]
for item in fruits:
    print(item)

# 2) range(5) yields 0, 1, 2, 3, 4 (the stop value itself is excluded)
for number in range(5):
    print(number)

# 3) A dictionary: .items() yields (key, value) pairs, unpacked in the loop header
person = dict(name="Bob", age=30, city="New York")
for field, entry in person.items():
    print(f"{field}: {entry}")

###### While Loops

In [None]:
# Repeat the body for as long as the condition holds; this prints 0 through 4.
count = 0
while count < 5:
    print(count)
    count += 1  # advance the counter, otherwise the condition would never become false

#### Functions

In [None]:
def greet(name):
    """Return the greeting "Hello, <name>!" for the given name."""
    greeting = f"Hello, {name}!"
    return greeting


print(greet("World"))

# Function with default parameter
def power(base, exponent=2):
    """Raise base to exponent; with a single argument, return base squared."""
    return pow(base, exponent)


# The default exponent applies whenever the second argument is omitted.
print(power(3))  # 9
print(power(3, 3))  # 27

### 1.3 Data Structures

#### Lists

In [None]:
# Lists are ordered and mutable: they can grow and shrink in place.
fruits = ["apple", "banana", "cherry"]

first_fruit = fruits[0]  # indexing is zero-based
print(first_fruit)  # apple

fruits.append("date")  # add one item to the end
print(fruits)  # ['apple', 'banana', 'cherry', 'date']

fruits.remove("banana")  # delete the first item equal to "banana"
print(fruits)  # ['apple', 'cherry', 'date']

#### Tuples

In [None]:
# A tuple groups a fixed sequence of values; here, a pair of numbers.
coordinates = (3, 4)
# Unpacking assigns each element to its own name, left to right.
x, y = coordinates
print(f"x: {x}, y: {y}")

#### Dictionaries

In [None]:
# A dictionary maps keys to values.
person = {"name": "Alice", "age": 30, "city": "New York"}

person_name = person["name"]  # look a value up by its key
print(person_name)  # Alice

person["job"] = "Engineer"  # assigning to a new key adds an entry
print(person)

#### Sets

In [None]:
# A set literal uses braces. Sets are unordered, so the order in which the
# items are printed below may differ from the order written here.
fruits = {"apple", "banana", "cherry"}
fruits.add("date")  # insert one item
print(fruits)
fruits.remove("banana")  # delete one item
print(fruits)

## Part 2: Python for Data Science and Machine Learning

### 2.1 Introduction to NumPy

First, install NumPy:

In [None]:
pip install numpy

Basic NumPy operations:

In [None]:
import numpy as np

# Build two one-dimensional arrays from Python lists
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([6, 7, 8, 9, 10])

print("Array 1:", arr1)
print("Array 2:", arr2)

# Arithmetic operators work element by element
print("Sum:", arr1 + arr2)
print("Multiplication:", arr1 * arr2)

# Summary statistics, computed here through the array methods
print("Mean of arr1:", arr1.mean())
print("Standard deviation of arr2:", arr2.std())

# -1 lets NumPy infer the row count: the 5 elements become a 5x1 column
matrix = arr1.reshape(-1, 1)
print("Reshaped array:\n", matrix)

# Matrix product of two 2x2 matrices, written with the @ operator
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print("Matrix multiplication:\n", A @ B)

### 2.2 Introduction to Pandas

Install Pandas:

In [None]:
pip install pandas

Let's explore Pandas with more detailed examples:

In [None]:
import pandas as pd
import numpy as np

# Creating a DataFrame: each dictionary key becomes a column
df = pd.DataFrame({
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
    'Salary': [50000, 60000, 55000, 75000]
})

print("Original DataFrame:")
print(df)

# Basic information about the DataFrame (info() prints its report itself)
print("\nDataFrame Info:")
df.info()

# Statistical summary of the numeric columns
print("\nStatistical Summary:")
print(df.describe())

# Selecting columns: a list of names returns a smaller DataFrame
print("\nNames and Ages:")
print(df[['Name', 'Age']])

# Filtering rows with a boolean condition
print("\nPeople older than 30:")
print(df[df['Age'] > 30])

# Adding a new column (one value per existing row)
df['Experience'] = [3, 8, 4, 10]
print("\nDataFrame with new 'Experience' column:")
print(df)

# Group by and aggregate
print("\nAverage Salary by City:")
print(df.groupby('City')['Salary'].mean())

# Sorting
print("\nSorted by Age (descending):")
print(df.sort_values('Age', ascending=False))

# Handling missing values
# NaN is a float, so convert Salary to float first; assigning NaN into an
# integer column relies on an implicit upcast that newer pandas deprecates.
df['Salary'] = df['Salary'].astype('float64')
df.loc[1, 'Salary'] = np.nan
print("\nDataFrame with missing value:")
print(df)

print("\nDropping rows with missing values:")
print(df.dropna())

print("\nFilling missing values with mean:")
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas 2.x raises TypeError on the text columns ('Name', 'City').
print(df.fillna(df.mean(numeric_only=True)))

# Date handling: four consecutive days starting 2023-01-01
df['Date'] = pd.date_range('2023-01-01', periods=4)
print("\nDataFrame with dates:")
print(df)

print("\nExtracting year from date:")
print(df['Date'].dt.year)

# Reading and writing data
# index=False keeps the row index out of the file; parse_dates restores the
# datetime type of 'Date', which would otherwise be read back as plain text.
df.to_csv('employees.csv', index=False)
df_read = pd.read_csv('employees.csv', parse_dates=['Date'])
print("\nData read from CSV:")
print(df_read)

### 2.3 Introduction to Scikit-learn

Install Scikit-learn:

In [None]:
pip install scikit-learn

Let's explore Scikit-learn with more detailed examples:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset: X holds the feature matrix, y the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Put features and target side by side in a DataFrame for a quick look
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y
print("Iris Dataset:")
print(df.head())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def show_results(header, predictions):
    """Print accuracy and the per-class report for one model's test-set predictions."""
    print(header)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions, target_names=iris.target_names))


# Logistic Regression, trained and evaluated on the scaled features
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
show_results("\nLogistic Regression Results:", lr_pred)

# Decision Tree, trained and evaluated on the unscaled features
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
show_results("\nDecision Tree Results:", dt_pred)

# Random Forest, trained and evaluated on the unscaled features
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
show_results("\nRandom Forest Results:", rf_pred)

# Feature Importance (for Random Forest), most important feature first
importance_table = pd.DataFrame({
    'feature': iris.feature_names,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

## Part 3: AWS Integration

### 3.1 Setting Up AWS SDK for Python (Boto3)

1. Open a terminal/command prompt
2. Install Boto3 using pip:
   ```
   pip install boto3
   ```
3. Create a new Python file named `aws_example.py`

### 3.2 Configuring AWS Credentials

1. Create a file named `credentials` in `~/.aws/` (Linux/Mac) or `C:\Users\YOUR_USERNAME\.aws\` (Windows)
2. Add your AWS credentials to this file:
   ```
   [default]
   aws_access_key_id = YOUR_ACCESS_KEY
   aws_secret_access_key = YOUR_SECRET_KEY
   ```
   Replace `YOUR_ACCESS_KEY` and `YOUR_SECRET_KEY` with your actual AWS credentials.

### 3.3 Basic AWS Operations with Python

#### Listing S3 Buckets

Add the following code to `aws_example.py`:

In [None]:
import boto3

# Create an S3 client and ask it for the bucket listing
s3 = boto3.client('s3')
response = s3.list_buckets()

# Collect the bucket names first, then print one bullet line per bucket
bucket_names = [entry['Name'] for entry in response['Buckets']]

print("S3 Buckets:")
for bucket_name in bucket_names:
    print(f"- {bucket_name}")

Run the script:

In [None]:
python aws_example.py

#### Uploading a File to S3

Add this function to `aws_example.py`:

In [None]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the object under; when None, file_name is
            used as the key.

    Returns:
        True if the upload call completed, False if it raised an exception
        (the error is printed instead of being propagated).
    """
    object_name = file_name if object_name is None else object_name

    s3_client = boto3.client('s3')
    succeeded = True
    try:
        s3_client.upload_file(file_name, bucket, object_name)
        print(f"File {file_name} uploaded successfully to {bucket}/{object_name}")
    except Exception as e:
        # Deliberately broad: any failure is reported and turned into False
        print(f"Error uploading file: {e}")
        succeeded = False
    return succeeded


# Usage
upload_file('sample.txt', 'your-bucket-name')

Replace `'your-bucket-name'` with your actual bucket name.

#### Launching an EC2 Instance

Add this function to `aws_example.py`:

In [None]:
def launch_ec2_instance(
    image_id='ami-0aa7d40eeae50c9a9',
    instance_type='t2.micro',
    key_name='your-key-pair-name',
):
    """Launch a single EC2 instance and return its ID.

    Args:
        image_id: AMI to boot. The default is an Amazon Linux 2 AMI ID; AMI
            IDs may vary by region, so pass the one valid for your region.
        instance_type: EC2 instance type to launch.
        key_name: Name of an existing EC2 key pair. Replace the placeholder
            default with your key pair name.

    Returns:
        The ID of the newly created instance.
    """
    ec2 = boto3.resource('ec2')

    # MinCount == MaxCount == 1 requests exactly one instance
    instances = ec2.create_instances(
        ImageId=image_id,
        MinCount=1,
        MaxCount=1,
        InstanceType=instance_type,
        KeyName=key_name,
    )

    instance_id = instances[0].id
    print(f"New instance created: {instance_id}")
    return instance_id


# Usage
launch_ec2_instance()

Replace `'your-key-pair-name'` with the name of your EC2 key pair.

### 3.4 Using AWS Data Wrangler

Install AWS Data Wrangler:

In [None]:
pip install awswrangler

Let's explore some examples of using AWS Data Wrangler with various AWS services:

#### 3.4.1 Working with Amazon S3

In [None]:
import awswrangler as wr
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35]
})

# Write DataFrame to S3 as a CSV file.
# index=False keeps the row index out of the file; otherwise it comes back as
# an extra unnamed column when the file is read again below.
wr.s3.to_csv(
    df=df,
    path='s3://your-bucket-name/employees.csv',
    index=False
)
print("Data written to S3")

# Read CSV from S3
df_read = wr.s3.read_csv('s3://your-bucket-name/employees.csv')
print("Data read from S3:")
print(df_read)

# List objects in an S3 bucket
objects = wr.s3.list_objects('s3://your-bucket-name/')
print("Objects in bucket:")
for obj in objects:
    print(obj)

#### 3.4.2 Working with Amazon Athena

In [None]:
# Store the DataFrame as a Parquet dataset on S3 and register it as a table
# in the AWS Glue Data Catalog so that Amazon Athena can query it
wr.s3.to_parquet(
    df=df,
    path='s3://your-bucket-name/employees_parquet/',
    dataset=True,
    database='your_database',
    table='employees',
)
print("Data written to Glue Data Catalog")

# Run a query through Athena and print the result
df_athena = wr.athena.read_sql_query(
    sql="SELECT * FROM employees WHERE age > 30",
    database='your_database',
)
print("Data read from Athena:")
print(df_athena)

# Start a second query, keep its execution ID, and look up that execution's details
query_execution_id = wr.athena.start_query_execution(
    sql="SELECT * FROM employees",
    database='your_database',
)
details = wr.athena.get_query_execution(query_execution_id=query_execution_id)
print("Query execution details:", details)

#### 3.4.3 Working with AWS Glue

In [None]:
# Inspect the Glue Data Catalog at three levels: all databases, the tables of
# one database, and the schema of one table. Uses `wr` (awswrangler) imported
# in an earlier cell.

# List all databases in Glue Data Catalog
databases = wr.catalog.databases()
print("Glue databases:", databases)

# List all tables in a specific database
tables = wr.catalog.tables(database='your_database')
print("Tables in your_database:", tables)

# Get the schema of a specific table
schema = wr.catalog.table(database='your_database', table='employees')
print("Schema of employees table:", schema)

#### 3.4.4 Working with Amazon Redshift

In [None]:
# Assuming you have a Redshift cluster set up.
# awswrangler's Redshift functions expect an open redshift_connector
# connection, not a SQLAlchemy-style URL string. wr.redshift.connect() builds
# one from a connection stored in the AWS Glue Catalog, which also keeps the
# password out of the notebook.
con = wr.redshift.connect(connection='your-glue-connection-name')

try:
    # Write DataFrame to Redshift (mode='overwrite' replaces an existing table)
    wr.redshift.to_sql(
        df=df,
        table='employees',
        schema='public',
        con=con,
        mode='overwrite'
    )
    print("Data written to Redshift")

    # Read data from Redshift
    df_redshift = wr.redshift.read_sql_query(
        sql="SELECT * FROM public.employees",
        con=con
    )
    print("Data read from Redshift:")
    print(df_redshift)
finally:
    # Always release the database connection, even if a call above fails
    con.close()

#### 3.4.5 Working with Amazon QuickSight

In [None]:
# Create a QuickSight dataset from an Athena table.
# The dataset is attached to an existing QuickSight Athena data source, named
# via data_source_name. create_athena_dataset() has no region argument: the
# region comes from the default boto3 session configuration.
response = wr.quicksight.create_athena_dataset(
    name="EmployeesDataset",
    database='your_database',
    table='employees',
    data_source_name='your-athena-data-source',
    account_id='your-aws-account-id'
)
print("QuickSight dataset created:", response)

# List QuickSight datasets
datasets = wr.quicksight.list_datasets(account_id='your-aws-account-id')
print("QuickSight datasets:", datasets)

## Conclusion

In this comprehensive tutorial, we've covered:
1. Python fundamentals, including data structures and control flow
2. Introduction to data science and machine learning libraries: NumPy, Pandas, and Scikit-learn
3. AWS integration using Boto3 and AWS Data Wrangler

We've explored how to use AWS Data Wrangler with various AWS services, including:
-