# Materials Property Analysis - AWS Tier 2

This notebook demonstrates:
1. Uploading crystal structures to S3
2. Processing with Lambda to compute properties
3. Querying results from DynamoDB
4. Visualizing property distributions
5. Filtering materials by property range
6. Exporting results to CSV and a text report

**Prerequisites:**
- AWS credentials configured
- S3 bucket created
- DynamoDB table created
- Lambda function deployed

## Setup and Configuration

In [None]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from decimal import Decimal
import os

# Set plot style
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to every figure below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need matplotlib >= 3.6 — confirm
# against the environment this notebook is expected to run in.
plt.style.use('seaborn-v0_8-darkgrid')
# Select seaborn's 'husl' palette; cells that pass an explicit color= override it.
sns.set_palette('husl')
%matplotlib inline

In [None]:
# Configuration
# Identifiers of the AWS resources used throughout the notebook.
BUCKET_NAME = 'materials-data-YOUR-ID'  # Replace with your bucket name
DYNAMODB_TABLE = 'MaterialsProperties'
LAMBDA_FUNCTION = 'process-crystal-structure'
AWS_REGION = 'us-east-1'

# Initialize AWS clients (all bound to AWS_REGION)
s3 = boto3.client('s3', region_name=AWS_REGION)
lambda_client = boto3.client('lambda', region_name=AWS_REGION)
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)

# Handle to the results table; creating the handle does not issue a request.
table = dynamodb.Table(DYNAMODB_TABLE)

# Echo the active settings so the rendered notebook records what it ran against.
for setting_label, setting_value in (
    ("Bucket", BUCKET_NAME),
    ("Table", DYNAMODB_TABLE),
    ("Region", AWS_REGION),
):
    print(f"{setting_label}: {setting_value}")

## 1. Upload Crystal Structures to S3

In [None]:
# Create sample CIF file for testing
# A small hand-written CIF with two Si sites, used to exercise the pipeline end to end.
sample_cif = """
data_Si
_cell_length_a    3.867
_cell_length_b    3.867
_cell_length_c    3.867
_cell_angle_alpha 90.0
_cell_angle_beta  90.0
_cell_angle_gamma 90.0
_symmetry_space_group_name_H-M 'F d -3 m'
_symmetry_Int_Tables_number 227
loop_
_atom_site_label
_atom_site_type_symbol
_atom_site_fract_x
_atom_site_fract_y
_atom_site_fract_z
Si1 Si 0.0 0.0 0.0
Si2 Si 0.25 0.25 0.25
"""

# Save locally: write the text into a scratch directory (created on demand).
cif_path = 'temp/Si.cif'
Path(cif_path).parent.mkdir(exist_ok=True)
Path(cif_path).write_text(sample_cif)

# Upload to S3 under the 'structures/' prefix with an explicit content type.
s3.upload_file(
    Filename=cif_path,
    Bucket=BUCKET_NAME,
    Key='structures/Si.cif',
    ExtraArgs={'ContentType': 'chemical/x-cif'},
)

print("Uploaded Si.cif to S3")

In [None]:
# List uploaded structures
# list_objects_v2 returns at most 1,000 keys per call, so a single request would
# silently under-report larger buckets. The paginator follows the continuation
# tokens and yields every page under the prefix.
structure_objects = []
for page in s3.get_paginator('list_objects_v2').paginate(
    Bucket=BUCKET_NAME, Prefix='structures/'
):
    # 'Contents' is absent from a page that holds no keys.
    structure_objects.extend(page.get('Contents', []))

if structure_objects:
    print(f"Found {len(structure_objects)} structures in S3:\n")
    for obj in structure_objects:
        print(f"  {obj['Key']} ({obj['Size']/1024:.1f} KB)")
else:
    print("No structures found in S3")

## 2. Trigger Lambda Processing

In [None]:
# Manually invoke Lambda for Si.cif
# The payload mimics the shape of an S3 event notification so the function can
# be driven by hand with the same handler code path.
payload = {
    'Records': [{
        's3': {
            'bucket': {'name': BUCKET_NAME},
            'object': {'key': 'structures/Si.cif'}
        }
    }]
}

# 'RequestResponse' makes the call synchronous: the function's return value
# (or its error document) comes back in the response payload.
response = lambda_client.invoke(
    FunctionName=LAMBDA_FUNCTION,
    InvocationType='RequestResponse',
    Payload=json.dumps(payload)
)

result = json.loads(response['Payload'].read())

# A failure inside the function is signalled by the 'FunctionError' field in
# the response, not by an exception; label it so it is not mistaken for a
# successful result.
function_error = response.get('FunctionError')
if function_error:
    print(f"Lambda invocation failed ({function_error}):")
else:
    print("Lambda Response:")
print(json.dumps(result, indent=2))

## 3. Query Results from DynamoDB

In [None]:
# Scan entire table
# Each scan call returns one page; keep requesting pages, resuming from the
# previous page's LastEvaluatedKey, until a page arrives without one.
materials = []
scan_kwargs = {}
while True:
    response = table.scan(**scan_kwargs)
    materials.extend(response['Items'])
    if 'LastEvaluatedKey' not in response:
        break
    scan_kwargs = {'ExclusiveStartKey': response['LastEvaluatedKey']}

print(f"Found {len(materials)} materials in DynamoDB")

In [None]:
# Convert to DataFrame
def decimal_to_float(obj):
    """Convert every Decimal inside ``obj`` to float, recursively.

    Args:
        obj: Any value. Dicts, lists, tuples, sets and frozensets are walked
            recursively; anything else is treated as a leaf.

    Returns:
        A new structure of the same container types in which each ``Decimal``
        leaf has been replaced by ``float(leaf)``. Non-Decimal leaves are
        returned unchanged. Dict keys are left as they are.
    """
    if isinstance(obj, Decimal):
        return float(obj)
    if isinstance(obj, dict):
        return {key: decimal_to_float(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [decimal_to_float(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(decimal_to_float(item) for item in obj)
    # boto3 deserializes DynamoDB number-set attributes to a Python set of
    # Decimal, so sets must be walked too or their members stay unconverted.
    if isinstance(obj, frozenset):
        return frozenset(decimal_to_float(item) for item in obj)
    if isinstance(obj, set):
        return {decimal_to_float(item) for item in obj}
    return obj

# Strip Decimals from every scanned item, then load the records into pandas.
materials_clean = list(map(decimal_to_float, materials))
df = pd.DataFrame(materials_clean)

# Display first few entries
print("\nMaterials DataFrame:", df.head(), sep="\n")

In [None]:
# Get specific material
# Point lookup by key; the response has no 'Item' entry when nothing matches.
response = table.get_item(Key={'material_id': 'Si'})

if 'Item' not in response:
    print("Material not found")
else:
    si_props = decimal_to_float(response['Item'])
    print("Silicon Properties:", json.dumps(si_props, indent=2), sep="\n")

## 4. Property Analysis and Visualization

In [None]:
# Summary statistics
# Only describe the columns that are actually present: items written by the
# Lambda may lack some attributes, and selecting a missing column raises KeyError.
summary_columns = ['density', 'volume', 'num_atoms', 'lattice_a']
present_columns = [col for col in summary_columns if col in df.columns]
absent_columns = [col for col in summary_columns if col not in df.columns]

if len(df) == 0:
    print("No materials to analyze yet. Upload more structures!")
elif not present_columns:
    print(f"None of the expected columns are present: {', '.join(summary_columns)}")
else:
    print("Summary Statistics:")
    print(df[present_columns].describe())
    if absent_columns:
        print(f"\nSkipped missing columns: {', '.join(absent_columns)}")

In [None]:
# Density distribution
# Guard first: bail out with a message unless there are rows and a density column.
if len(df) == 0 or 'density' not in df.columns:
    print("Not enough data for density histogram")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    density_values = df['density']
    density_values.hist(ax=ax, bins=20, edgecolor='black')
    ax.set_title('Distribution of Material Densities', fontsize=14)
    ax.set_xlabel('Density (g/cm³)', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Volume vs Density scatter plot
def _is_missing(value):
    """Return True for None or a float NaN (how pandas marks an absent cell)."""
    return value is None or (isinstance(value, float) and np.isnan(value))


if len(df) > 1 and 'volume' in df.columns and 'density' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['volume'], df['density'], s=100, alpha=0.6)

    # Annotate points
    for _, row in df.iterrows():
        # A row without coordinates has no marker, so it gets no label either.
        if _is_missing(row['volume']) or _is_missing(row['density']):
            continue
        # Prefer the formula; fall back to the material id when the formula is
        # absent OR present-but-NaN (row.get's default only covers the former).
        label = ''
        for column in ('formula', 'material_id'):
            candidate = row.get(column)
            if not _is_missing(candidate):
                label = str(candidate)
                break
        ax.annotate(
            label,
            (row['volume'], row['density']),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=9
        )

    # Volume is a length cubed: lattice lengths are in Å, so the unit is Å³.
    ax.set_xlabel('Volume (Å³)', fontsize=12)
    ax.set_ylabel('Density (g/cm³)', fontsize=12)
    ax.set_title('Material Density vs Volume', fontsize=14)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("Not enough data for scatter plot")

In [None]:
# Crystal system distribution
# Bar chart of how many materials fall into each crystal system.
if len(df) == 0 or 'crystal_system' not in df.columns:
    print("Not enough data for crystal system distribution")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    crystal_counts = df['crystal_system'].value_counts()
    crystal_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
    ax.set_title('Distribution by Crystal System', fontsize=14)
    ax.set_xlabel('Crystal System', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    # Tilt the category labels so longer names stay legible.
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Space group distribution (if available)
# Horizontal bars for the ten most frequent space groups.
if len(df) == 0 or 'space_group' not in df.columns:
    print("Not enough data for space group distribution")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    all_space_group_counts = df['space_group'].value_counts()
    space_group_counts = all_space_group_counts.head(10)
    space_group_counts.plot.barh(ax=ax, color='coral', edgecolor='black')
    ax.set_title('Top 10 Space Groups', fontsize=14)
    ax.set_xlabel('Count', fontsize=12)
    ax.set_ylabel('Space Group', fontsize=12)
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Lattice parameter correlations
# One scatter panel per pair of lattice lengths: (a, b), (b, c), (a, c).
required_lattice_columns = {'lattice_a', 'lattice_b', 'lattice_c'}

if len(df) > 2 and required_lattice_columns.issubset(df.columns):
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    axis_pairs = [('a', 'b'), ('b', 'c'), ('a', 'c')]
    for panel, (x_param, y_param) in zip(axes, axis_pairs):
        panel.scatter(df[f'lattice_{x_param}'], df[f'lattice_{y_param}'], alpha=0.6)
        panel.set_xlabel(f'{x_param} (Å)', fontsize=11)
        panel.set_ylabel(f'{y_param} (Å)', fontsize=11)
        panel.set_title(f'{x_param} vs {y_param}')
        panel.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Not enough data for lattice parameter correlations")

## 5. Query by Property Range

In [None]:
# Query materials with density between 2.0 and 5.0 g/cm³
from boto3.dynamodb.conditions import Attr

# A filtered scan is still paginated: the filter is applied to each page after
# it is read, so a page may hold few (or zero) matches while more pages remain.
# Follow LastEvaluatedKey until it is absent to collect every match.
density_filter = Attr('density').between(Decimal('2.0'), Decimal('5.0'))

response = table.scan(FilterExpression=density_filter)
filtered_materials = list(response['Items'])
while 'LastEvaluatedKey' in response:
    response = table.scan(
        FilterExpression=density_filter,
        ExclusiveStartKey=response['LastEvaluatedKey']
    )
    filtered_materials.extend(response['Items'])

print(f"Found {len(filtered_materials)} materials with density 2.0-5.0 g/cm³")

if filtered_materials:
    filtered_df = pd.DataFrame([decimal_to_float(m) for m in filtered_materials])
    # Show only the columns that exist; an item lacking e.g. 'formula' must not
    # turn the display into a KeyError.
    display_columns = [
        col for col in ['material_id', 'formula', 'density', 'volume']
        if col in filtered_df.columns
    ]
    print("\nFiltered Materials:")
    print(filtered_df[display_columns])

## 6. Export Results

In [None]:
# Export all materials to CSV
# Nothing is written when the frame has no rows.
if len(df) == 0:
    print("No materials to export")
else:
    output_file = 'materials_properties.csv'
    row_count = len(df)
    df.to_csv(output_file, index=False)
    print(f"Exported {row_count} materials to {output_file}")

In [None]:
# Generate summary report
def _stats_lines(frame, title, column, unit):
    """Return the report lines for one numeric column.

    Yields the section title followed by mean / std dev / min / max, or a
    single "N/A" entry when ``column`` is not in ``frame`` (so a missing
    attribute does not abort the whole report with a KeyError).
    """
    if column not in frame.columns:
        return [f"{title}:", "- N/A"]
    values = frame[column]
    return [
        f"{title}:",
        f"- Mean: {values.mean():.2f} {unit}",
        f"- Std Dev: {values.std():.2f} {unit}",
        f"- Min: {values.min():.2f} {unit}",
        f"- Max: {values.max():.2f} {unit}",
    ]


if len(df) > 0:
    if 'crystal_system' in df.columns:
        # to_string() is multi-line; split it so every row joins the report
        # at the same indentation.
        crystal_lines = df['crystal_system'].value_counts().to_string().splitlines()
    else:
        crystal_lines = ['N/A']

    # Build the report line by line so source-code indentation does not leak
    # into the printed text or the saved file.
    report_lines = [
        "Materials Property Analysis Report",
        "===================================",
        "",
        f"Total Materials: {len(df)}",
        "",
        *_stats_lines(df, "Density Statistics", 'density', 'g/cm³'),
        "",
        # Volume is a length cubed, hence Å³ (not Å²).
        *_stats_lines(df, "Volume Statistics", 'volume', 'Å³'),
        "",
        "Crystal Systems:",
        *crystal_lines,
    ]
    report = "\n".join(report_lines) + "\n"

    print(report)

    # Save report. The text contains non-ASCII unit symbols, so pin the
    # encoding instead of relying on the platform default.
    with open('materials_report.txt', 'w', encoding='utf-8') as f:
        f.write(report)
    print("\nReport saved to materials_report.txt")

## 7. Cleanup (Optional)

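Run this section only when you're done with the analysis. The cell below removes only the local `temp/` directory; it does not delete the S3 objects, DynamoDB items, or any other AWS resources.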

In [None]:
# Delete temporary files
import shutil
# Remove the local scratch directory (and everything in it) if it is there.
temp_dir = Path('temp')
if temp_dir.exists():
    shutil.rmtree(temp_dir)
    print("Deleted temporary files")

## Next Steps

1. **Upload more structures**: Use `scripts/upload_to_s3.py` to batch upload CIF files
2. **Analyze patterns**: Look for correlations between properties
3. **Machine learning**: Train models to predict properties
4. **Move to Tier 3**: Deploy production infrastructure with CloudFormation

For cleanup instructions, see `cleanup_guide.md`.