In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Load the dataset: read the gzipped CSV named below into `df`.
# The path is relative, so it is resolved against the current working directory.
counts_path = 'GSE272639_Apc_SI_series_raw_counts.csv.gz'
df = pd.read_csv(counts_path)

In [11]:
# Preview the top rows of the table as plain text to sanity-check its columns
top_rows = df.head()
print(top_rows)

  Gene_Symbol          ENSEMBL_ID  WT_Apc_KRas_LZR18181128_S4  \
0       Gnai3  ENSMUSG00000000001                        4372   
1       Cdc45  ENSMUSG00000000028                         537   
2         H19  ENSMUSG00000000031                         128   
3       Scml2  ENSMUSG00000000037                          23   
4        Apoh  ENSMUSG00000000049                           9   

   WT_Apc_KRas_LZR18181128_S2  WT_Apc_KRas_LZR18181128_S3  \
0                        4305                        4060   
1                         550                         426   
2                         107                          88   
3                          17                          18   
4                           2                           2   

   WT_Apc_KRas_LZR18181128_S6  WT_Apc_KRas_LZR18181128_S7  \
0                        5658                        4736   
1                         643                         475   
2                          76                         205  

In [None]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Handle missing values: fill gaps in numerical columns with that column's mean.
# `numeric_only=True` restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on text columns such as Gene_Symbol and
# ENSEMBL_ID. Text columns get no fill value, so their gaps are left as-is.
# `df` is rebound instead of using `inplace=True`, which keeps the cell
# chain-friendly and makes the data flow explicit.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)

In [None]:
# Encode categorical variables: PIK3CA mutation status, 'Yes' -> 1 and 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: with only
# the 'Yes'/'No' keys, a second execution would map the already-encoded values
# to NaN. Any other value still becomes NaN.
# NOTE(review): the preview of this table shows gene identifiers and per-sample
# count columns; confirm that a 'PIK3CA_mutation' column actually exists,
# otherwise this lookup raises KeyError.
mutation_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['PIK3CA_mutation'] = df['PIK3CA_mutation'].map(mutation_codes)

In [None]:
# Normalize numerical features by rescaling the listed columns with MinMaxScaler
# and writing the scaled values back into `df`.
# NOTE(review): 'feature1' / 'feature2' look like placeholder names; confirm
# they match real columns of the loaded table.
from sklearn.preprocessing import MinMaxScaler

feature_cols = ['feature1', 'feature2']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[feature_cols])
df[feature_cols] = scaled_values

In [None]:
# Data Visualization
# Bar plot for PIK3CA mutations across cancer types.
#
# The legend text is derived from the data (0 -> 'No', 1 -> 'Yes') and the hue
# order is fixed. Passing only `labels=[...]` to `legend()` pairs labels with
# plotted artists purely by their order, which can mislabel or mis-colour the
# hue levels.
# NOTE(review): assumes 'PIK3CA_mutation' is already encoded as 0/1 and that a
# 'cancer_type' column exists in `df` — confirm against the loaded table.
plot_df = df.assign(
    PIK3CA_label=df['PIK3CA_mutation'].map({0: 'No', 1: 'Yes'})
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=plot_df,
    x='cancer_type',
    hue='PIK3CA_label',
    hue_order=['No', 'Yes'],
    ax=ax,
)
ax.set_title('Distribution of PIK3CA Mutations Across Cancer Types')
ax.set_xlabel('Cancer Type')
ax.set_ylabel('Count')
# Re-create the legend from the labelled artists so only title/location change.
ax.legend(title='PIK3CA Mutation', loc='upper right')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Box plot of survival time, one box per PIK3CA mutation status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='PIK3CA_mutation', y='survival_time', ax=ax)
ax.set_title('Survival Time Based on PIK3CA Mutation Status')
ax.set_xlabel('PIK3CA Mutation')
ax.set_ylabel('Survival Time (days)')
# Put ticks at positions 0 and 1 and label them 'No' / 'Yes'.
# NOTE(review): this presumes the 0 group is drawn at position 0 and the
# 1 group at position 1 — confirm both groups are present in the data.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()