In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, mean, stddev

# Initialize Spark Session
# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not start a second session.
session_builder = SparkSession.builder.appName("Big Data Analysis")
spark = session_builder.getOrCreate()

# Load Dataset using Spark
# The first row supplies the column names and Spark infers each column's type
# from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("adult.csv")

# Data Preprocessing
# Handling missing values: drop every row that contains at least one null.
# NOTE(review): some distributions of the Adult dataset mark missing values
# with a "?" token instead of null; dropna() would keep those rows. Confirm
# against adult.csv.
df = df.dropna()

# String (categorical) columns are intentionally left untouched here: casting
# a string column to "string" changes nothing. The actual conversion to
# numeric indices is done by the StringIndexer step in the MLlib section.

# Summary statistics
summary_stats = df.describe()
summary_stats.show()

# Feature Engineering
# Partition the columns in a single pass over the schema: columns typed
# 'string' are treated as categorical, every other type as numeric.
numeric_features = []
categorical_features = []
for field_name, field_type in df.dtypes:
    if field_type == 'string':
        categorical_features.append(field_name)
    else:
        numeric_features.append(field_name)

# Aggregation and Grouping Example
# Per education level: number of non-null "income" values and the mean of
# "hoursPerWeek".
education_groups = df.groupBy("education")
education_summary = education_groups.agg(
    count("income").alias("count"),
    mean("hoursPerWeek").alias("avg_hours"),
)
education_summary.show()

# Machine Learning with Spark MLlib
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Encode categorical columns
# Fit one StringIndexer per categorical column against the not-yet-indexed
# frame; each fitted model writes its result to "<column>_index".
indexers = []
for categorical_col in categorical_features:
    index_stage = StringIndexer(
        inputCol=categorical_col,
        outputCol=categorical_col + "_index",
    )
    indexers.append(index_stage.fit(df))

# Apply the fitted models one after another, accumulating the index columns.
for indexer in indexers:
    df = indexer.transform(df)

# Assemble Features
# "income" is one of the string columns, so its indexed form ("income_index")
# is produced alongside the other categorical indices. That column is the
# training label, so it must be kept out of the feature vector; otherwise the
# model is handed the answer and the reported accuracy is meaningless.
label_index_col = "income_index"
indexed_categoricals = [f"{cat_name}_index" for cat_name in categorical_features]
feature_cols = numeric_features + [
    indexed_name
    for indexed_name in indexed_categoricals
    if indexed_name != label_index_col
]

# Pack all predictor columns into the single vector column "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Split Data
# 80/20 random split; the fixed seed keeps the split reproducible.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Train Model
# RandomForestClassifier here is the pyspark.ml class: its import comes after,
# and therefore shadows, the scikit-learn class of the same name.
#
# Spark's tree learners require maxBins to be at least the number of
# categories of every categorical feature, and the default is 32. Size it from
# the fitted StringIndexer models so a high-cardinality column does not abort
# training; the floor of 32 keeps the default behaviour for small cardinalities.
max_bins = max([32] + [len(fitted_indexer.labels) for fitted_indexer in indexers])
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="income_index",
    numTrees=100,
    maxBins=max_bins,
)
model = rf.fit(train_df)

# Predictions
# Score the held-out split with the trained model.
predictions = model.transform(test_df)

# Evaluate Model
# Accuracy of the predictions measured against the indexed income label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="income_index",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Show Sample Predictions
# Ten rows of true label next to predicted label.
sample_columns = ["income_index", "prediction"]
predictions.select(*sample_columns).show(10)

# Stop Spark Session
spark.stop()

ModuleNotFoundError: No module named 'pyspark'