In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import datetime


# Load the raw assignment export; the later cells derive their data from `df`.
df = pd.read_csv("Assignment_DataScientist_20250502.csv")
print(f"Shape of the dataset: {df.shape}")
# Last expression of the cell: rendered as a preview of the first five rows.
df.head(n=5)


In [None]:

# Column dtypes and non-null counts (printed by pandas itself).
df.info()
# Per-column count of missing values; shown as the cell's output.
df.isna().sum()


In [None]:

# Positional layout used by the slices below: the first 56 columns are treated
# as the real part of the spectrum, the next 56 as the imaginary part, and
# every column from position 112 onward as metadata.
spectra_real, spectra_imag = df.iloc[:, :56], df.iloc[:, 56:112]
spectra_all = pd.concat([spectra_real, spectra_imag], axis=1)

meta_cols = df.columns[112:]
metadata = df[meta_cols]

# Horizontal bars of the per-channel mean and standard deviation.
channel_stats = spectra_all.describe().T[['mean', 'std']]
channel_stats.plot(kind='barh', figsize=(10, 15), title='Spectral Channels: Mean and Std')
plt.tight_layout()
plt.show()



In [None]:

targets = ['Brix', 'TA', 'Firmness (kg)']

# One histogram (with a KDE overlay) per target, side by side.
fig, axs = plt.subplots(1, 3, figsize=(18, 5))
for ax, target in zip(axs, targets):
    sns.histplot(metadata[target], kde=True, ax=ax)
    ax.set_title(f'{target} Distribution')
plt.tight_layout()
plt.show()



In [None]:

# Pairwise correlations between the listed metadata columns and the targets.
corr_columns = ['sensorT', 'Size', 'Weight'] + targets
correlations = metadata[corr_columns].corr()

sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:

# Standardise every spectral column before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(spectra_all)

# random_state pins the projection if scikit-learn selects a randomized SVD
# solver; the seed matches the one used for t-SNE in this notebook.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Carry the original row labels so the concat below aligns row for row with
# the target columns instead of relying on both sides having a default index.
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'], index=metadata.index)
df_pca = pd.concat([df_pca, metadata[targets]], axis=1)

# (hue column, colour palette, label used in the plot title)
pca_views = [
    ('Brix', 'viridis', 'Brix'),
    ('TA', 'plasma', 'TA'),
    ('Firmness (kg)', 'coolwarm', 'Firmness'),
]
# Same PC1/PC2 scatter three times, coloured by a different target each time.
for hue_col, palette, title_label in pca_views:
    sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue=hue_col, palette=palette)
    plt.title(f'PCA of Spectra Colored by {title_label}')
    plt.show()




In [None]:

from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the standardised spectra, with a fixed seed.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Attach the target columns so the embedding can be coloured by them.
embedding = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
df_tsne = pd.concat([embedding, metadata[targets]], axis=1)

sns.scatterplot(data=df_tsne, x='TSNE1', y='TSNE2', hue='Firmness (kg)', palette='coolwarm')
plt.title("t-SNE of Spectra Colored by Firmness")
plt.show()


In [None]:

# Number of rows recorded for each (Experiment, Fruit nr) pair.
fruit_groups = metadata.groupby(['Experiment', 'Fruit nr'])
fruit_count = fruit_groups.size()
print(f"Fruits with multiple spectra: {(fruit_count > 1).sum()} / {len(fruit_count)}")





Fruits with multiple spectra: 900 / 900


In [None]:

# Correlation of every spectral column with each target: one row per spectral
# column, one column per target.
spectra_target_corr = pd.DataFrame(
    {target: spectra_all.corrwith(metadata[target]) for target in targets}
)
spectra_target_corr = spectra_target_corr.rename_axis('Spectral Feature')
spectra_target_corr.head()



In [None]:

fig, axs = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# Plot against positional feature indices so the x-axis agrees with its label
# ("Real: 0–55, Imag: 56–111"). Plotting against the DataFrame index would use
# the column labels themselves and, when those are strings, draw one
# categorical tick label per feature.
feature_positions = np.arange(len(spectra_target_corr))

# One panel per target, all sharing the same x-axis.
for ax, target in zip(axs, targets):
    ax.plot(feature_positions, spectra_target_corr[target], label=f'{target} Correlation')
    ax.axhline(0, color='gray', linestyle='--')  # zero-correlation reference
    ax.set_ylabel("Correlation")
    ax.legend()
    ax.grid(True)

# With sharex=True only the bottom panel needs the x label.
axs[-1].set_xlabel("Spectral Feature Index (Real: 0–55, Imag: 56–111)")
fig.suptitle("Correlation Between Spectral Features and Targets")
fig.tight_layout()
plt.show()



In [None]:

# For each target, keep the ten features with the largest absolute correlation.
top_features = {}
for target in targets:
    abs_corr = spectra_target_corr[target].abs()
    top_corr = abs_corr.sort_values(ascending=False).head(10)
    top_features[target] = top_corr
    print(f"\nTop correlated features for {target}:\n{top_corr}")



In [None]:

# Target distributions before any filtering, one boxplot per target.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, target in zip(axes, targets):
    sns.boxplot(data=metadata, y=target, ax=ax)
    ax.set_title(f'Boxplot of {target} (with outliers)')
fig.tight_layout()
plt.show()


In [None]:

from scipy.stats import zscore

# A target value whose |z-score| exceeds this is treated as an outlier.
Z_THRESHOLD = 3

# nan_policy="omit": by default zscore propagates NaN, so a single missing
# value in a target column would turn that whole column's z-scores into NaN
# and silently hide every outlier in it. With "omit" the mean/std ignore the
# missing entries; a row whose own value is missing still gets a NaN z-score
# and is therefore never flagged through that column.
target_zscores = metadata[targets].apply(zscore, nan_policy="omit")

# A row is an outlier if any of its targets exceeds the threshold.
outlier_mask = (target_zscores.abs() > Z_THRESHOLD).any(axis=1)

print(f"Number of outliers detected: {outlier_mask.sum()}")


In [None]:

# Keep only the rows not flagged by the outlier mask and renumber them from 0.
keep_rows = ~outlier_mask
df_clean = df.loc[keep_rows].reset_index(drop=True)
print(f"Shape before outlier removal: {df.shape}")
print(f"Shape after outlier removal: {df_clean.shape}")


In [None]:

# Metadata columns (position 112 onward) of the outlier-filtered frame.
cleaned_metadata = df_clean.iloc[:, 112:]

# Same per-target boxplots as before, now on the filtered rows.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, target in zip(axes, targets):
    sns.boxplot(data=cleaned_metadata, y=target, ax=ax)
    ax.set_title(f'Boxplot of {target} (after outlier removal)')
fig.tight_layout()
plt.show()
