## ignoring warnings

In [1]:
import warnings
warnings.filterwarnings("ignore")

## Loading data

In [2]:
import torch
data = torch.load('../dataset/dataset/part_one_dataset/train_data/1_train_data.tar.pth')

## Analyzing data

In [3]:
print(data.keys())

dict_keys(['data', 'targets'])


In [4]:
print(data['data'].shape)
print(data['targets'].shape)

(2500, 32, 32, 3)
(2500,)


In [None]:
print(data['data'][0]

In [None]:
print(type(data['data']))

In [None]:
print(data['data'][0].shape)

In [None]:
print(type(data))

## Converting dict to df

In [None]:
print(type(data['targets']))
print(type(data['data']))

In [None]:
print(data['targets'].shape)
print(data['data'].shape)

In [None]:
df = pd.DataFrame(data['targets'])
df.columns = ['targets']
print(df.head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.countplot(x='targets', data=df)

# Display the plot
plt.title('Frequency of Values in Column1')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

## Extracting features

In [None]:
import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model

# Example data (replace this with your actual data)
X_train = data['data']  # 2500 images, 32x32x3 shape
y_train = data['targets']   # 10 classes (0-9)

# Create a CNN model for feature extraction
input_layer = layers.Input(shape=(32, 32, 3))

# Convolutional Layer 1
x = layers.Conv2D(32, (3, 3), activation='relu')(input_layer)
x = layers.MaxPooling2D((2, 2))(x)

# Convolutional Layer 2
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)

# Flatten the output of the last convolutional layer
x = layers.Flatten()(x)

# Fully connected layers (optional for classification)
x = layers.Dense(128, activation='relu')(x)
output_layer = layers.Dense(10, activation='softmax')(x)

# Build the model
cnn_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Perform a dummy pass to initialize the model and its weights
cnn_model.predict(np.random.rand(1, 32, 32, 3))  # A dummy pass to initialize the model

# Now, remove the classification layers and only output the features
# Access the last convolutional layer (before flattening) for feature extraction
feature_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.layers[2].output)  # The second conv layer is at index 2

# Extract features from the CNN model (this will be a 2D array of shape (2500, num_features))
features = feature_extractor.predict(X_train)

print(f"Extracted Features Shape: {features.shape}")  # Should print the shape like (2500, num_features)


In [None]:
features_flat = features.reshape(features.shape[0], -1)

# Now, 'features_flat' contains the extracted features
print(features_flat.shape) 

## Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Assume features_flat is your flattened feature array from the pretrained model
# For example, features_flat.shape = (2500, 25088) for VGG16

# Initialize PCA to reduce the dimensionality
pca = PCA(n_components=0.95)  # Retain 95% of the variance
features_reduced = pca.fit_transform(features_flat)

print(features_reduced.shape)  

In [None]:
print(type(features_reduced))

In [None]:
print(features_reduced[0])

## Converting to Gaussian

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the reduced feature set (features_reduced)
features_standardized = scaler.fit_transform(features_reduced)

# Now, 'features_standardized' is standardized and ready for use

print(features_standardized.shape)

In [None]:
df_data = pd.DataFrame(features_standardized)

In [None]:

print(df.head())

In [None]:
df_concat = pd.concat([df_data,df],axis = 1)
print(df_concat.head())

In [None]:
print((features_standardized))

In [None]:
print(type(data['targets']))

## DO the lwp

In [None]:


y = data['targets']
X = features_standardized



In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Reshape
from sklearn.metrics import accuracy_score

# Example: X is your input data (already transformed, standardized, and linearized)
# y is your labels (integers from 0 to 9 for 10 classes)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape data to fit LSTM input requirements (LSTM expects 3D input)
# Here, we reshape X_train and X_test from shape (num_samples, num_features) to (num_samples, timesteps, features)
# We can consider each feature as a timestep, with 1 feature per timestep (timesteps=1).

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # (num_samples, timesteps=1, num_features)
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))      # (num_samples, timesteps=1, num_features)

# Build the neural network model with an LSTM layer
model = Sequential()

# Input layer (since X has already been transformed, we start with the feature size directly)
model.add(LSTM(128, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), activation='relu', return_sequences=False))  # LSTM layer

# Dense layers
model.add(Dense(128, activation='relu'))  # Second hidden layer
model.add(Dense(64, activation='softmax'))  # Output layer with 10 classes

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=100, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Evaluate the model on the test set
y_pred_prob = model.predict(X_test_reshaped)
y_pred = np.argmax(y_pred_prob, axis=1)  # Get the predicted class (the class with the highest probability)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')


### Analyzing Images

In [None]:
from PIL import Image
import os
import tensorflow as tf

In [None]:
for i in range(2500):
    img = Image.fromarray(data['data'][i])
    target_dir = f"./{data['targets'][i]}"
    os.makedirs(target_dir, exist_ok=True)
    img.save(f"{target_dir}/img{i}.png")

### PCA

In [54]:
import tensorflow as tf
images = data['data']
# Enhanced CNN Model for feature extraction
model = tf.keras.Sequential([
    # First Convolutional Block
    tf.keras.layers.Conv2D(
        filters=4,  # Number of filters for learning low-level features
        kernel_size=(2, 2),  # Filter size
        strides=(1, 1),  # No down-sampling yet
        padding='same',  # Keeps output size same
        activation='relu',  # ReLU activation
        input_shape=(32, 32, 3)  # Input shape for RGB images
    ),
    tf.keras.layers.MaxPooling2D(
        pool_size=(2, 2),  # Reduce height and width by 2
        strides=(2, 2),
        padding='valid'  # Reduce size
    ),

    # Second Convolutional Block
    tf.keras.layers.Conv2D(
        filters=8,  # Double the filters for learning mid-level features
        kernel_size=(2, 2),
        strides=(1, 1),
        padding='same',
        activation='relu'
    ),
    tf.keras.layers.MaxPooling2D(
        pool_size=(2, 2),
        strides=(2, 2),
        padding='valid'
    ),

    # # Third Convolutional Block
    # tf.keras.layers.Conv2D(
    #     filters=8,  # Increase filters for high-level features
    #     kernel_size=(3, 3),
    #     strides=(2, 2),
    #     padding='same',
    #     activation='relu'
    # ),
    # tf.keras.layers.MaxPooling2D(
    #     pool_size=(2, 2),
    #     strides=(2, 2),
    #     padding='valid'
    # ),

    # # Fourth Convolutional Block
    # tf.keras.layers.Conv2D(
    #     filters=16,  # Increase further for richer high-level features
    #     kernel_size=(3, 3),
    #     strides=(2, 2),
    #     padding='same',
    #     activation='relu'
    # ),
    # tf.keras.layers.MaxPooling2D(
    #     pool_size=(2, 2),
    #     strides=(2, 2),
    #     padding='valid'
    # ),

    # Flatten the features to use later or pass through Dense layers
    tf.keras.layers.Flatten()
])

# Summary of the enhanced model
model.summary()

# Extract features using the enhanced model
features = model.predict(images)

# Print feature shape
print(f"Extracted features shape: {features.shape}")  # Example: (10, 256)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Extracted features shape: (2500, 512)


In [36]:
from sklearn.decomposition import PCA

# Assume features_flat is your flattened feature array from the pretrained model
# For example, features_flat.shape = (2500, 25088) for VGG16

# Initialize PCA to reduce the dimensionality
pca = PCA(n_components=0.95)  # Retain 95% of the variance
features_reduced = pca.fit_transform(features)

print(features_reduced.shape)  

(2500, 224)


### Tsne


In [48]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [53]:
tsne = TSNE(n_components=4, random_state=42)
features_2d = tsne.fit_transform(features)

# Create a scatter plot where each class will have a different color
# scatter = plt.scatter(features_2d[:, 0], features_2d[:, 1], c=y, cmap='jet', s=50, alpha=0.7)

# # Add a color bar
# plt.colorbar(scatter, label='Class')

# # Set plot labels and title
# plt.xlabel('t-SNE Component 1')
# plt.ylabel('t-SNE Component 2')
# plt.title('t-SNE visualization of high-dimensional data')

# # Show the plot
# plt.show()

ValueError: 'n_components' should be inferior to 4 for the barnes_hut algorithm as it relies on quad-tree or oct-tree.

In [61]:
import umap
umap_model = umap.UMAP(n_components=20, random_state=42)
features_2d = umap_model.fit_transform(features)  # Apply UMAP


AttributeError: module 'umap' has no attribute 'UMAP'

In [55]:
X = features_2d
y = data['targets']

In [56]:

# Normalize the input data (optional but recommended)

print(X.shape)
print(y.shape)

(2500, 512)
(2500,)


In [57]:
print(X[1])

[ 69.677765     0.           0.         165.73352      0.
 176.1767      31.34619    127.75385     83.99743      0.
   0.         229.00185      0.         169.65599     14.078581
 155.83363     88.53352      0.           0.         244.74388
   0.         202.96416     40.917416   146.73787     69.33796
   0.           0.         224.71765      0.         201.28569
  67.99785    163.4591      62.88956      0.           0.
 229.5444       0.         183.10992     27.285147   161.21759
  44.857487     2.6172452    0.         120.701004     0.
  79.66397      3.984033    93.256294    27.527452     0.
   0.          81.49945      0.          72.94427     17.175558
  57.538002    14.803525     0.           0.          84.30511
   0.          54.622143     7.3381886   53.97275     93.98909
   0.           0.         193.22508      0.         126.08862
  41.226246    97.7383      38.62818      0.           0.
 201.33678      0.          96.200615     0.81942445 103.929756
  78.187996     0. 

In [58]:
print(X[0])

[ 13.862544    14.950963     0.          67.28831      0.
  89.52783     58.406456    61.56639     17.7618      10.0758
   0.         107.9237       0.         101.77234     52.808384
  80.089195    11.62138     10.526963     0.         103.072014
   0.         100.64986     55.43944     79.60229      6.350231
   6.716766     0.         104.08884      0.          94.10237
  46.45564     87.55301      5.4538713    7.3767576    0.
  97.75872      0.          99.92352     47.637486    78.77121
  15.598895    11.029854     0.         108.19276      0.
 108.25096     62.37186     87.24896     10.566191     7.516699
   0.         115.85992      0.         108.86381     57.064243
  90.218575     7.802604     0.           0.         125.28652
   0.          98.84879     42.627308    90.30014     15.9485445
  12.499206     0.          84.75455      0.          98.74079
  51.255264    82.23799      4.657741     7.2165494    0.
  74.47747      0.          92.197495    46.12319     78.0116
  15.32

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the input data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.35
