In [30]:
import pandas as pd


def load_data(file_path):
    """Read a tabular file into a DataFrame, choosing the reader by extension.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the file. The extension is matched case-insensitively:
        ``.csv`` is read with ``pd.read_csv``, ``.json`` with ``pd.read_json``
        and ``.xlsx`` / ``.xls`` with ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    ValueError
        If the extension is not one of the supported formats.
    """
    # Match on a lower-cased string so that 'DATA.CSV' and pathlib.Path inputs
    # are dispatched the same way as a plain lower-case str path.
    normalized = str(file_path).lower()
    readers = (
        (('.csv',), pd.read_csv),
        (('.json',), pd.read_json),
        (('.xlsx', '.xls'), pd.read_excel),
    )
    for suffixes, reader in readers:
        if normalized.endswith(suffixes):
            # Hand the reader the caller's original value, not the lower-cased
            # copy, so case-sensitive file systems still find the file.
            return reader(file_path)
    raise ValueError("Unsupported file format!")


# Load the medal-table workbook and preview its first ten rows.
# The file name ends in '.xlsx', so load_data() sends it to pd.read_excel even
# though '.csv' also appears in the middle of the name.
# NOTE(review): this is an absolute path on one Windows machine, so the cell
# will fail anywhere else — consider a path relative to a configurable data
# directory.
file_path = r'C:\Users\91766\olympics2024.csv.xlsx'
data = load_data(file_path)
print(data.head(10))

   Rank        Country Country Code  Gold  Silver  Bronze  Total
0     1  United States           US    40      44      42    126
1     2          China          CHN    40      27      24     91
2     3          Japan          JPN    20      12      13     45
3     4      Australia          AUS    18      19      16     53
4     5         France          FRA    16      26      22     64
5     6    Netherlands          NED    15       7      12     34
6     7  Great Britain          GBG    14      22      29     65
7     8    South Korea          KOR    13       9      10     32
8     9          Italy          ITA    12      13      15     40
9    10        Germany          GER    12      13       8     33


In [36]:
def preprocess_data(data):
    """Clean a DataFrame: impute, de-duplicate, encode categoricals, trim outliers.

    The steps run in this order:

    1. Missing values in numeric columns are replaced by that column's median.
       Missing values in non-numeric columns are not imputed.
    2. Exact duplicate rows are dropped.
    3. Every ``object`` column is encoded: one-hot with the first level
       dropped when it has fewer than 10 distinct values, integer category
       codes otherwise.
    4. For each ``int64`` / ``float64`` column, rows outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are removed. Columns are handled
       one after another, so later columns compute their quartiles on rows
       that survived the earlier filters.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Surviving rows keep their original index labels.
    """
    # Bug fix: the original read an undefined local `df` here, which raised
    # UnboundLocalError on every call.
    cleaned = data.fillna(data.median(numeric_only=True))

    # .copy() yields an independent frame, so the column assignment in the
    # encoding loop never targets a slice of another frame.
    cleaned = cleaned.drop_duplicates().copy()

    # Encode categorical (object-dtype) columns.
    for col in cleaned.select_dtypes(include=['object']).columns:
        if cleaned[col].nunique() < 10:
            # Low cardinality: one-hot; drop_first avoids perfectly collinear dummies.
            cleaned = pd.get_dummies(cleaned, columns=[col], drop_first=True)
        else:
            # High cardinality: compact integer label per category.
            cleaned[col] = cleaned[col].astype('category').cat.codes

    # Remove outliers with the IQR rule. Encoded columns normally come out as
    # small-int codes or bool/uint8 indicators, so this dtype filter usually
    # selects only the originally numeric columns.
    numeric_cols = cleaned.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        cleaned = cleaned[cleaned[col].between(lower_bound, upper_bound)]

    return cleaned

In [38]:
# Run the cleaning pipeline on the loaded frame and preview the first ten rows
# of the result. The result is bound to a new name, `data1`, so `data` still
# refers to the frame that was passed in.
# NOTE(review): `data1` says little about what the frame holds; a
# stage-suffixed name would read better if no later cell depends on it.
data1 = preprocess_data(data)
print(data1.head(10))


    Rank  Country  Country Code  Gold  Silver  Bronze  Total
15    16       80            79     4       4       3     11
16    17       49            49     4       2       5     11
17    18       61            61     4       1       3      8
18    19       41            41     4       0       3      7
21    22       87            87     3       5       4     12
22    23       71            72     3       4       2      9
23    24       30            32     3       3       1      7
24    25        8             7     3       1       6     10
25    26       11            11     3       1       3      7
26    27       72            74     3       1       1      5


In [35]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rank          91 non-null     int64 
 1   Country       91 non-null     object
 2   Country Code  91 non-null     object
 3   Gold          91 non-null     int64 
 4   Silver        91 non-null     int64 
 5   Bronze        91 non-null     int64 
 6   Total         91 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 5.1+ KB
None


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def run_linear_regression(X, y):
    """Fit a ``LinearRegression`` model of ``y`` on ``X`` and return it.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``LinearRegression.fit``.
    y : array-like
        Target values passed straight to ``LinearRegression.fit``.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    # fit() returns the estimator itself, so construct-and-fit chains cleanly.
    return LinearRegression().fit(X, y)

def run_kmeans_clustering(X, n_clusters=3, random_state=None):
    """Fit a ``KMeans`` model on ``X`` and return it.

    Parameters
    ----------
    X : array-like
        Samples to cluster, passed straight to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters to form.
    random_state : int, RandomState instance or None, default None
        Seed for centroid initialisation. Pass an int to get the same
        clustering on every run; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    KMeans
        The fitted estimator.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(X)
    return model

def run_decision_tree_classification(X, y, test_size=0.2, random_state=None):
    """Train a decision-tree classifier on a train/test split and score it.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with ``X``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the share (or count) of samples
        held out for evaluation.
    random_state : int, RandomState instance or None, default None
        Seed used for both the split and the tree. Pass an int to make the
        reported accuracy repeatable; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(model, accuracy)``: the fitted ``DecisionTreeClassifier`` and its
        accuracy on the held-out test portion.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    # Accuracy is measured only on samples the tree never saw during fitting.
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return model, accuracy
    