In [None]:
import arff
import pandas as pd

# Read every record from the ARFF file; arff.load is consumed fully here so
# the rows can be reused below
data = [*arff.load('solar-flare_2.arff')]

# Turn each ARFF row into a plain list and build the DataFrame from them
# (no column names are passed, so pandas assigns default integer labels)
rows = list(map(list, data))
df = pd.DataFrame(rows)

# Keep a CSV copy of the raw table, without the index column
df.to_csv('solar-flare_2.csv', index=False)

Description
Author: Gary Bradshaw
Source: UCI
Please cite:

Solar Flare database Relevant Information: -- The database contains 3 potential classes, one for the number of times a certain type of solar flare occurred in a 24 hour period. -- Each instance represents captured features for 1 active region on the sun. -- The data are divided into two sections. The second section (flare.data2) has had much more error correction applied to it, and has consequently been treated as more reliable.

Number of Instances: flare.data1: 323, flare.data2: 1066

Number of attributes: 13 (includes 3 class attributes)

Attribute Information
1. Code for class (modified Zurich class)  (A,B,C,D,E,F,H)
2. Code for largest spot size              (X,R,S,A,H,K)
3. Code for spot distribution              (X,O,I,C)
4. Activity                                (1 = reduced, 2 = unchanged)
5. Evolution                               (1 = decay, 2 = no growth, 
                                            3 = growth)
6. Previous 24 hour flare activity code    (1 = nothing as big as an M1,
                                            2 = one M1,
                                            3 = more activity than one M1)
7. Historically-complex                    (1 = Yes, 2 = No)
8. Did region become historically complex  (1 = yes, 2 = no) 
   on this pass across the sun's disk
9. Area                                    (1 = small, 2 = large)
10. Area of the largest spot (1 = <=5, 2 = >5)

From all these predictors three classes of flares are predicted, which are represented in the last three columns.

C-class flares production by this region Number in the following 24 hours (common flares)
M-class flares production by this region Number in the following 24 hours (moderate flares)
X-class flares production by this region Number in the following 24 hours (severe flares)
CLASSTYPE: nominal CLASSINDEX: first

13 Features
Feature Name	Type	Distinct/Missing Values	Ontology
class (target)	nominal	6 distinct values
0 missing attributes	
largest_spot_size	nominal	6 distinct values
0 missing attributes	
spot_distribution	nominal	4 distinct values
0 missing attributes	
Activity	nominal	2 distinct values
0 missing attributes	
Evolution	nominal	3 distinct values
0 missing attributes	
Previous_24_hour_flare_activity_code	nominal	3 distinct values
0 missing attributes	
Historically-complex	nominal	2 distinct values
0 missing attributes	

In [None]:
# Data cleaning step: convert the DataFrame of strings to nullable integers.
#
# A blanket pd.to_numeric(..., errors='coerce') silently replaces every
# non-numeric value with NA. According to the attribute list, the first three
# attributes are letter codes (e.g. A..H, X/R/S/..., X/O/I/C), so those columns
# would lose all of their information. Non-numeric columns are therefore
# encoded as integer category codes instead. Encoding happens in place
# (one column in, one column out) so column positions stay unchanged.
def _numeric_or_category_codes(column):
    """Return `column` as numbers, falling back to category codes.

    If every non-missing value parses as a number, the parsed values are
    returned. Otherwise each distinct label is mapped to an integer code
    (labels sorted, codes starting at 0) and missing values stay missing.
    """
    numeric = pd.to_numeric(column, errors='coerce')
    if numeric.notna().sum() == column.notna().sum():
        return numeric
    codes = column.astype('category').cat.codes
    # cat.codes marks missing values with -1; turn those back into NA
    return codes.where(codes >= 0)


df = df.apply(_numeric_or_category_codes).astype("Int64")

In [None]:
# Keep every column except the final three (positional slice)
df = df.iloc[:, :-3]

Part 2: Unsupervised Learning (Data Analysis and Pattern Recognition)
1. Objectives:
o Explore patterns using unsupervised methods.
o Test and compare various dimensionality reduction (≥5) and clustering (≥3)
techniques.
2. Deliverables:
o Preprocessing code tailored to the dataset.
o Implementation of DR and clustering methods.
o Annotated Jupyter Notebook with results and insights.
o Relate clustering results to real-world interpretability based on the dataset
domain.
o A 1-page summary discussing findings and comparisons.

In [None]:
# Five Dimensionality Reduction Techniques
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding

# Project the feature frame with each of the five techniques.
# All embeddings use two components: the plots only draw two axes, and the
# first two axes of a higher-dimensional MDS solution are not the same as a
# 2-D MDS solution, so MDS is fitted in 2-D like the others.
# t-SNE, MDS and LLE accept a seed; fixing it makes a re-run reproduce the
# same embedding (42 matches the seed used elsewhere in this notebook).
PCA_df = PCA(n_components=2).fit_transform(df)
TSNE_df = TSNE(n_components=2, random_state=42).fit_transform(df)
MDS_df = MDS(n_components=2, random_state=42).fit_transform(df)
Isomap_df = Isomap(n_components=2).fit_transform(df)
LLE_df = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(df)


In [None]:
# Comparison of DR Techniques performance
from sklearn.cluster import KMeans

# Cluster every embedding with an identical K-Means configuration so that
# differences between the results come from the embedding, not from the
# clusterer. random_state makes the labels reproducible; n_init is pinned
# because its default differs between scikit-learn versions.
KMeans_PCA_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(PCA_df)
KMeans_TSNE_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(TSNE_df)
KMeans_MDS_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(MDS_df)
KMeans_Isomap_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(Isomap_df)
KMeans_LLE_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(LLE_df)

In [None]:
import matplotlib.pyplot as plt
# Scatter of the PCA embedding, colored by the K-Means label of each point.
# Title and axis labels are set so the figure can be identified on its own.
fig, ax = plt.subplots()
ax.scatter(PCA_df[:, 0], PCA_df[:, 1], c=KMeans_PCA_df, s=50, cmap='viridis')
ax.set(title='PCA embedding colored by K-Means cluster',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Scatter of the t-SNE embedding, colored by the K-Means label of each point.
# Title and axis labels are set so the figure can be identified on its own.
fig, ax = plt.subplots()
ax.scatter(TSNE_df[:, 0], TSNE_df[:, 1], c=KMeans_TSNE_df, s=50, cmap='viridis')
ax.set(title='t-SNE embedding colored by K-Means cluster',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Scatter of the first two MDS components, colored by the K-Means label of
# each point. Title and axis labels are set so the figure can be identified
# on its own.
fig, ax = plt.subplots()
ax.scatter(MDS_df[:, 0], MDS_df[:, 1], c=KMeans_MDS_df, s=50, cmap='viridis')
ax.set(title='MDS embedding (first two components) colored by K-Means cluster',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Scatter of the Isomap embedding, colored by the K-Means label of each point.
# Title and axis labels are set so the figure can be identified on its own.
fig, ax = plt.subplots()
ax.scatter(Isomap_df[:, 0], Isomap_df[:, 1], c=KMeans_Isomap_df, s=50, cmap='viridis')
ax.set(title='Isomap embedding colored by K-Means cluster',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Scatter of the LLE embedding, colored by the K-Means label of each point.
# Title and axis labels are set so the figure can be identified on its own.
fig, ax = plt.subplots()
ax.scatter(LLE_df[:, 0], LLE_df[:, 1], c=KMeans_LLE_df, s=50, cmap='viridis')
ax.set(title='LLE embedding colored by K-Means cluster',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
# Three Clustering Techniques
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

# Run the three clustering algorithms on the same PCA embedding.
# K-Means is seeded (and n_init pinned) so its labels are reproducible.
KMeans_df = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(PCA_df)
Agglomerative_df = AgglomerativeClustering(n_clusters=3).fit_predict(PCA_df)
# NOTE(review): eps is measured in the units of the PCA projection; check the
# label counts, since a radius of 3 may merge most points into one cluster.
DBSCAN_df = DBSCAN(eps=3, min_samples=2).fit_predict(PCA_df)

# Comparison of Clustering Techniques performance
# n_clusters=3 is a working choice, not a ground truth: the dataset's three
# flare classes (C, M, X) are three separate target columns of flare counts,
# not three groups of active regions, so they do not by themselves fix the
# number of clusters.


In [None]:
import matplotlib.pyplot as plt
# PCA embedding colored by the K-Means labels; title and axis labels make
# the figure distinguishable from the other clustering plots.
fig, ax = plt.subplots()
ax.scatter(PCA_df[:, 0], PCA_df[:, 1], c=KMeans_df, s=50, cmap='viridis')
ax.set(title='K-Means clusters on the PCA embedding',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# PCA embedding colored by the agglomerative-clustering labels; title and
# axis labels make the figure distinguishable from the other clustering plots.
fig, ax = plt.subplots()
ax.scatter(PCA_df[:, 0], PCA_df[:, 1], c=Agglomerative_df, s=50, cmap='viridis')
ax.set(title='Agglomerative clusters on the PCA embedding',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# PCA embedding colored by the DBSCAN labels (DBSCAN assigns -1 to points it
# treats as noise); title and axis labels make the figure distinguishable
# from the other clustering plots.
fig, ax = plt.subplots()
ax.scatter(PCA_df[:, 0], PCA_df[:, 1], c=DBSCAN_df, s=50, cmap='viridis')
ax.set(title='DBSCAN clusters on the PCA embedding',
       xlabel='Component 1', ylabel='Component 2')
plt.show()

Part 3: Supervised Learning (Classification and/ or Regression)
1. Objectives:
o Develop and evaluate two models; either classification models or regression
models depending on your use case.
1. Some classification models could be Artificial Neural Networks (ANN)
and Support Vector Machines (SVM) [not limited to these].
2. Some regression models could be Random Forest or Gradient Boosting
[not limited to these].
3. Packages like scikit-learn have algorithms that you can use.
o Compare the effectiveness of the models using different preprocessing techniques
and hyperparameter tuning.
2. Deliverables:
o Preprocessing code specific to the supervised task.
o Implementations of two predictive models.
o Annotated Jupyter Notebook showcasing experiments and analyses.
o A 1-page summary of findings, including a performance comparison between the
models.


In [None]:
# Reload the raw ARFF data: earlier cells overwrote `df` (numeric coercion and
# removal of the last three columns), so the supervised part starts again from
# the original file.
import arff
import pandas as pd

# Read every record from the ARFF file into a list
data = [*arff.load('solar-flare_2.arff')]

# One plain list per ARFF row, then a DataFrame built from those lists
rows = list(map(list, data))
df = pd.DataFrame(rows)

# Write the CSV copy again, without the index column
df.to_csv('solar-flare_2.csv', index=False)

In [None]:
# Show the reloaded frame as this cell's output, to inspect it before splitting
df

In [65]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and the targets, then split into train and test sets.
#
# The reloaded frame holds raw ARFF values: strings, and for the first
# attributes letter codes according to the attribute list. The regressors
# fitted later need numeric input, so features are converted column by column
# (numeric where possible, integer category codes otherwise) and the three
# target columns are parsed as numbers.
def _encode_feature_column(column):
    """Return `column` as numbers, using category codes for non-numeric labels."""
    numeric = pd.to_numeric(column, errors='coerce')
    if numeric.notna().sum() == column.notna().sum():
        return numeric
    # Distinct labels are sorted and mapped to 0, 1, 2, ...
    return column.astype('category').cat.codes


# The last three columns are the targets; everything before them is a feature
X = df.drop(df.columns[-3:], axis=1).apply(_encode_feature_column)
# pd.to_numeric raises on a non-numeric target instead of hiding it
y = df[df.columns[-3:]].apply(pd.to_numeric)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor: two hidden layers of 100 units each,
# at most 1000 training iterations, seeded with 42
model = MLPRegressor(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=42,
)

# Fit on the training split; the fitted estimator is shown as the cell output
model.fit(X_train, y_train)

In [69]:
# Evaluate performance of regression
from sklearn.metrics import mean_squared_error

# Predict the targets of the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Mean squared error between the true and predicted targets; the bare name
# on the last line displays the value as the cell output
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse


0.15248073024397513

In [70]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees, seeded with 42.
# Note: this rebinds `model`, replacing the estimator fitted earlier.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

# Fit on the training split; the fitted estimator is shown as the cell output
model.fit(X_train, y_train)

In [71]:
# Evaluate performance of regression
from sklearn.metrics import mean_squared_error

# Predict the targets of the held-out rows with the current `model`
y_pred = model.predict(X_test)

# Mean squared error between the true and predicted targets; the bare name
# on the last line displays the value as the cell output
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

0.16605011290853222