In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file below the Kaggle input directory so the
# dataset location can be confirmed before it is loaded.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
        
from matplotlib import pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weightlifting results CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/summer-olympics-weightlifting-records-2000-to-2020/to_csv_out.csv')

In [None]:
# Keep only rows with a recorded bodyweight of at least 30 kg and renumber the index.
# NOTE(review): presumably this removes placeholder/invalid bodyweights - confirm the threshold.
df = df[df['Bodyweight (kg)'] >= 30].reset_index(drop=True)

In [None]:
# Preview the first rows of the filtered frame.
df.head()

In [None]:
# Number of rows left after the bodyweight filter.
len(df)

# Correlation between bodyweight and lifting records

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
def get_x_y_nparrary(df, x_name='Bodyweight (kg)', y_name='Snatch (kg)', filter_invalid=True, reshape=False):
    """Extract two columns of ``df`` as NumPy arrays suitable for model fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``x_name`` and ``y_name``.
    x_name, y_name : str
        Column names used for the predictor and the target.
    filter_invalid : bool, default True
        When True, rows where either value is ``<= 0`` are dropped. When False
        every row is returned untouched (previously this flag was ignored and
        the filter was always applied).
    reshape : bool, default False
        When True the x array is returned with shape ``(n, 1)`` instead of
        ``(n,)``; the y array is always one-dimensional.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xs, ys)`` in the original row order.
    """
    x_col = df[x_name]
    y_col = df[y_name]

    if filter_invalid:
        # Negated "is invalid" test rather than "x > 0 and y > 0": rows holding
        # NaN compare False against 0 and are therefore kept, exactly as the
        # original row-by-row loop did.
        keep = ~((x_col <= 0) | (y_col <= 0))
        x_col = x_col[keep]
        y_col = y_col[keep]

    # np.array copies, so callers never receive a view into the DataFrame.
    xs = np.array(x_col)
    ys = np.array(y_col)

    if reshape:
        xs = xs.reshape((-1, 1))
    return xs, ys

In [None]:
def fit_and_add_plot(x, y, color, year, scatters, scatter_legends, linear, reshape, add_scatter=True, add_plot=True):
    """Fit a regression of ``y`` on ``x`` and draw it on the current pyplot figure.

    Parameters
    ----------
    x, y : numpy.ndarray
        Predictor and target values. ``x`` must already have the shape the
        estimator accepts (scikit-learn expects ``(n, 1)``, i.e. produced with
        ``reshape=True`` upstream).
    color
        Matplotlib colour used for both the curve and the scatter points.
    year
        Label for this series; printed with the linear equation and appended
        to ``scatter_legends``.
    scatters, scatter_legends : list
        Mutated in place: the scatter handle and its label are appended when
        ``add_scatter`` is True, so the caller can build a legend.
    linear : bool
        True fits a straight line and prints its equation; False fits a
        degree-3 polynomial regression.
    reshape : bool
        True evaluates the curve on a column vector ``(m, 1)`` grid, matching
        an ``x`` that was reshaped the same way.
    add_scatter : bool, default True
        Draw the raw points.
    add_plot : bool, default True
        Draw the fitted curve.
    """
    # The fitted model is only consumed by the curve and by the printed
    # equation, so scatter-only calls skip the fit altogether.
    if add_plot or linear:
        # Other estimators tried earlier in this notebook: IsotonicRegression,
        # RandomForestRegressor, LGBMRegressor.
        if linear:
            model = LinearRegression()
        else:
            model = make_pipeline(PolynomialFeatures(3), LinearRegression())
        model.fit(x, y)

        if linear:
            # coef_ is an array; formatting a 1-element array with %f relies on
            # an implicit array->scalar conversion that NumPy has deprecated,
            # so pull the scalar out explicitly.
            slope = float(np.ravel(model.coef_)[0])
            intercept = float(np.ravel(model.intercept_)[0])
            print('%s: y = %.2fx + %.1f' % (str(year), slope, intercept))

        if add_plot:
            # Integer grid from 5 below the smallest x up to (but excluding)
            # 5 above the largest x, in steps of 1.
            X_for_line = np.arange(int(x.min()) - 5, int(x.max()) + 5, 1)
            if reshape:
                X_for_line = X_for_line.reshape((-1, 1))
            Y_for_line = model.predict(X_for_line)
            plt.plot(X_for_line, Y_for_line, color=color, linewidth=3)

    if add_scatter:
        sc = plt.scatter(x, y, color=color)
        scatters.append(sc)
        scatter_legends.append(year)


def fit_and_plot_weight_vs_records_by_years(data, linear=False, all_years=False, reshape=False, record_type='Snatch (kg)', gender='Men', lowest_ranking=8, max_weight=1000, min_weight=0):
    """Plot bodyweight against a record column with one fitted curve per year.

    ``data`` is narrowed with ``data_filters`` (gender, ranking cut-off and
    bodyweight bounds). With ``all_years=True`` a single blue series is fitted
    on all remaining rows; otherwise each distinct ``Year`` gets its own colour
    from the 'gnuplot' colormap, its own fit and its own scatter. ``linear``
    and ``reshape`` are forwarded to ``fit_and_add_plot``. The figure is shown
    with ``plt.show()``; nothing is returned.

    The bodyweight range printed in the title is the range actually observed
    after filtering, not the ``min_weight``/``max_weight`` arguments.
    """
    selected = data_filters(data, gender=gender, lowest_ranking=lowest_ranking,
                            max_weight=max_weight, min_weight=min_weight)

    year_values = sorted(set(selected['Year']))
    first_year, last_year = year_values[0], year_values[-1]

    weight_values = sorted(set(selected['Bodyweight (kg)']))
    lightest, heaviest = weight_values[0], weight_values[-1]

    title = '%s records from %d to %d for top %d %s athletes [%.1fkg~%.1fkg]' % (
        record_type, first_year, last_year, lowest_ranking, gender, lightest, heaviest)

    # One evenly spaced colour per year.
    colormap = plt.get_cmap('gnuplot')
    palette = [colormap(pos) for pos in np.linspace(0, 1, len(year_values))]

    handles = []
    labels = []

    if all_years:
        xs, ys = get_x_y_nparrary(selected, reshape=reshape, y_name=record_type)
        fit_and_add_plot(xs, ys, 'blue', 'all year', handles, labels, linear, reshape,
                         add_scatter=True, add_plot=True)
    else:
        for season, shade in zip(year_values, palette):
            season_rows = selected[selected['Year'] == season]
            xs, ys = get_x_y_nparrary(season_rows, reshape=reshape, y_name=record_type)
            fit_and_add_plot(xs, ys, shade, season, handles, labels, linear, reshape,
                             add_scatter=True, add_plot=True)

    plt.legend(handles, labels, scatterpoints=1, loc='best', ncol=3, fontsize=8)
    plt.title(title)
    plt.xlabel('Bodyweight (kg)')
    plt.ylabel('Weight (kg)')
    plt.show()
    
def fit_and_plot_weight_vs_records_all_years(data, record_type='Snatch (kg)', gender='Men', lowest_ranking=8):
    """Plot bodyweight against a record column with one curve fitted on all years.

    ``data`` is narrowed with ``data_filters`` by gender and ranking cut-off.
    A single blue curve is fitted on every remaining row (non-linear fit, no
    scatter), then each distinct ``Year`` is overlaid as a scatter series in
    its own 'gnuplot' colour without a per-year curve. The figure is shown
    with ``plt.show()``; nothing is returned.
    """
    selected = data_filters(data, gender=gender, lowest_ranking=lowest_ranking)

    year_values = sorted(set(selected['Year']))
    first_year, last_year = year_values[0], year_values[-1]

    title = '%s records from %d to %d for top %d %s athletes' % (
        record_type, first_year, last_year, lowest_ranking, gender)

    # One evenly spaced colour per year.
    colormap = plt.get_cmap('gnuplot')
    palette = [colormap(pos) for pos in np.linspace(0, 1, len(year_values))]

    handles = []
    labels = []

    # Pooled trend line: curve only, so it contributes no legend entry.
    xs, ys = get_x_y_nparrary(selected, reshape=True, y_name=record_type)
    fit_and_add_plot(xs, ys, 'blue', 'all year', handles, labels,
                     linear=False, reshape=True, add_scatter=False, add_plot=True)

    # Per-year points: scatter only, each registered for the legend.
    for season, shade in zip(year_values, palette):
        season_rows = selected[selected['Year'] == season]
        xs, ys = get_x_y_nparrary(season_rows, reshape=True, y_name=record_type)
        fit_and_add_plot(xs, ys, shade, season, handles, labels,
                         linear=False, reshape=True, add_scatter=True, add_plot=False)

    plt.legend(handles, labels, scatterpoints=1, loc='best', ncol=3, fontsize=8)
    plt.title(title)
    plt.xlabel('Bodyweight (kg)')
    plt.ylabel('Weight (kg)')
    plt.show()

In [None]:
def data_filters(df, gender='All', max_weight=1000, min_weight=0,
                 highest_ranking=1, lowest_ranking=100):
    """Return the rows of ``df`` matching the gender, bodyweight and ranking limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Bodyweight (kg)' and 'Ranking'; 'Gender' is only read
        when ``gender`` is not 'All'.
    gender : str, default 'All'
        Exact value to match in the 'Gender' column, or 'All' for no gender filter.
    max_weight, min_weight : number
        Inclusive bounds on 'Bodyweight (kg)'.
    highest_ranking, lowest_ranking : number
        Inclusive bounds on 'Ranking' (``highest_ranking`` is the smaller number).

    Returns
    -------
    pandas.DataFrame
        The selected rows in their original order, with their original index.
    """
    # Series.between is inclusive on both ends by default.
    keep = (
        df['Bodyweight (kg)'].between(min_weight, max_weight)
        & df['Ranking'].between(highest_ranking, lowest_ranking)
    )
    if gender != 'All':
        keep &= df['Gender'] == gender
    return df[keep]

In [None]:
# Pooled view for the top 8 athletes: one figure per lift and gender, each with
# a single trend line over all years and the yearly results as coloured points.
for record_type in ('Snatch (kg)', 'Clean & Jerk (kg)', 'Total (kg)'):
    for gender in ('Men', 'Women'):
        fit_and_plot_weight_vs_records_all_years(df, gender=gender, lowest_ranking=8, record_type=record_type)

In [None]:
# Per-year fits of the total lifted by the top 3 athletes, men first, then women.
for gender in ('Men', 'Women'):
    fit_and_plot_weight_vs_records_by_years(df, gender=gender, lowest_ranking=3, record_type='Total (kg)', reshape=True)

In [None]:
# Same per-year fits as above, but with an upper bodyweight cap per gender
# (110 kg for men, 88 kg for women) so the heaviest athletes are excluded.
for gender, weight_cap in (('Men', 110), ('Women', 88)):
    fit_and_plot_weight_vs_records_by_years(df, gender=gender, lowest_ranking=3, record_type='Total (kg)', reshape=True, max_weight=weight_cap)