In [21]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [22]:
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

In [3]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a grid of per-column distribution charts for ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. A column whose first value is numeric is drawn as a
    histogram; any other column is drawn as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualise.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots per grid row.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If there is nothing to
        plot, a message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nShown = min(len(columnNames), nGraphShown)
    if nShown <= 0:
        # Avoid creating a zero-height figure when nothing qualifies.
        print('No distribution plots shown: there are no eligible columns to plot')
        return
    # Ceiling division with `//` keeps the row count an int (plt.subplot
    # rejects floats) and sizes the grid for the plots actually drawn.
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The chart type is decided from the type of the column's first value.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [4]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Display the correlation matrix of ``df`` as a colour-coded image.

    Non-numeric columns, columns containing NaN and constant columns are
    removed before the correlation is computed. If fewer than two columns
    remain, a message is printed and nothing is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. If it carries a ``dataframeName`` attribute, that
        value is used in the plot title.
    graphWidth : float
        Width and height of the (square) figure, in inches.

    Returns
    -------
    None
    """
    # Read the optional label first: the filtered frames created below do not
    # carry the attribute. Fall back gracefully when the caller never set it.
    filename = getattr(df, 'dataframeName', None)
    # corr() is only defined for numeric data; newer pandas raises on other dtypes.
    df = df.select_dtypes(include=['number', 'bool'])
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be a keyword in pandas >= 2.0)
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure just created rather than assuming it is figure 1.
    corrMat = plt.matshow(corr, fignum = fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    title = 'Correlation Matrix' if filename is None else f'Correlation Matrix for {filename}'
    plt.title(title, fontsize=15)
    plt.show()

In [5]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    Columns containing NaN and constant columns are removed, and at most the
    first 10 remaining columns are plotted. Each upper-triangle panel is
    annotated with the correlation coefficient of its column pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualise.
    plotSize : float
        Width and height of the (square) figure, in inches.
    textSize : float
        Font size of the correlation annotations.

    Returns
    -------
    None
        If no usable column remains, a message is printed and nothing is plotted.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword in pandas >= 2.0
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    df = df[columnNames]
    if not columnNames:
        # scatter_matrix cannot build a grid from zero columns.
        print('No scatter plots shown: there are no non-NaN, non-constant numeric columns')
        return
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly instead of pyplot's incidental `plt.np` alias.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [26]:
import kaggle
import pandas as pd
import numpy as np 

from zipfile import ZipFile


from kaggle.api.kaggle_api_extended import KaggleApi
# Create the Kaggle API client and authenticate it; `api` is reused by the cells below.
# NOTE(review): presumably authenticate() reads credentials from the local Kaggle
# configuration (kaggle.json or environment variables) — confirm against the Kaggle API docs.
api = KaggleApi()
api.authenticate()

In [27]:
# List competitions in the 'gettingStarted' category.
api.competitions_list(category='gettingStarted')

[contradictory-my-dear-watson,
 gan-getting-started,
 store-sales-time-series-forecasting,
 tpu-getting-started,
 digit-recognizer,
 titanic,
 house-prices-advanced-regression-techniques,
 connectx,
 nlp-getting-started,
 spaceship-titanic,
 facial-keypoints-detection,
 street-view-getting-started-with-julia,
 word2vec-nlp-tutorial,
 data-science-london-scikit-learn,
 just-the-basics-the-after-party,
 just-the-basics-strata-2013]

In [28]:
# List datasets without any search filter.
api.dataset_list()

[iamsouravbanerjee/world-population-dataset,
 pantanjali/unemployment-dataset,
 harshsingh2209/tesla-stock-pricing-20172022,
 thedevastator/airplane-crashes-and-fatalities,
 whenamancodes/student-performance,
 ariyoomotade/netflix-data-cleaning-analysis-and-visualization,
 whenamancodes/students-performance-in-exams,
 whenamancodes/violence-against-women-girls,
 whenamancodes/popular-movies-datasets-58000-movies,
 alexandrepetit881234/korean-demographics-20002022,
 whenamancodes/world-population-live-dataset,
 whenamancodes/netflix-prime-video-disney-hulu,
 thedevastator/mcdonalds-ice-cream-machines-broken-timeseries,
 deepcontractor/smoke-detection-dataset,
 thedevastator/weather-prediction,
 whenamancodes/data-science-fields-salary-categorization,
 moazzimalibhatti/co2-emission-by-countries-year-wise-17502022,
 sergylog/ab-test-data,
 advaypatil/youtube-statistics,
 nabilajahan/the-impact-of-electronic-gadget-uses]

In [33]:
# Search the dataset listing for the query "road damage".
api.dataset_list(search="road damage")

[alvarobasily/road-damage,
 psycon/ukraine-mariupol-damage-assessment,
 prudhvignv/road-damage-classification-and-assessment,
 rounak041993/traffic-violations-in-maryland-county,
 nichaoku/gbaccident0516,
 mohammedkuheil/roaddamagegan,
 trolololo888/potholes-and-road-damage-with-annotations,
 hotsonhonet/hackerearths-fast-furious-and-insured-challenge,
 thaddeussegura/eminem-lyrics-from-all-albums,
 gan2gan/austin-bicycle-crashes-from-20102017,
 pachriisk/great-britain-road-accidents,
 dariasvasileva/hourly-weather-data-in-ireland-from-24-stations,
 ghalibahmed2022/road-damage-detection,
 ninaflirp/nashville-vehicle-collision-dataset]

In [35]:
# Download the files of the selected dataset.
# NOTE(review): the following cell opens 'road-damage-classification-and-assessment.zip'
# from the working directory, so the archive is presumably saved there — confirm.
api.dataset_download_files("prudhvignv/road-damage-classification-and-assessment")

In [36]:
# Unpack the downloaded archive into ./data/. The context manager guarantees
# the zip handle is closed even if extraction raises part-way through.
with ZipFile('road-damage-classification-and-assessment.zip') as zf:
    zf.extractall('./data/') #save files in selected folder