In [None]:
# TODO
# First important task:
# Build user-level statistics.
#  Percent: mean, median.
#  Delta: mean, median, trend (clearly increasing, clearly decreasing, fluctuating, random - in that case use the end - start value)
#  At the individual level, measure the deviation from the population - e.g. in intensity and variability; this could possibly be used for clustering
#  Possibly fill in the values of the missing weeks - just insert them based on the average weekly difference computed from the two existing measurements.
#  Three columns holding the values of the last, second-to-last and third-to-last day, plus statistics on them.

In [None]:
import os
# Walk every file below the Kaggle input directory.
# NOTE(review): the joined path is built but neither printed nor stored, so the
# loop has no visible effect - confirm whether the listing was silenced on purpose.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install PySpark from the shell before it is imported on the next line.
!pip install pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that the data-loading cell reads through.
spark = SparkSession.builder.appName('pulmonary_fibrosis').getOrCreate()

In [None]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import Window
from pyspark.sql.functions import expr, col, lag, avg as sparkavg, min as sparkmin, max as sparkmax, stddev, skewness, count, kurtosis, round as sparkround, first, last, when

In [None]:
# Load the competition tables; the first CSV row is the header and column types are inferred.
csvReader = spark.read.options(header='true', inferSchema='true')
df_train = csvReader.csv("/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv")
df_test = csvReader.csv("/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv")

In [None]:
# Per-patient windows over the measurement week: one ascending, one descending.
_patientPartition = Window.partitionBy('Patient')
weeksWindow = _patientPartition.orderBy(col('Weeks').asc())
weeksDescWindow = _patientPartition.orderBy(col('Weeks').desc())

def enrichWithWindow(dataframe):
    """Add row-to-row change columns computed inside each patient's window.

    Uses the module-level ``weeksWindow`` (partitioned by Patient, ordered by
    Weeks) to look one row back with ``lag``.

    Columns added:
        weekDiff: ``Weeks`` minus the previous row's ``Weeks``.
        percentDiff: ``Percent`` minus the previous row's ``Percent``.
        percentDeltaPerWeek: ``percentDiff / weekDiff``.

    The helper columns ``weekLag`` and ``percentLag`` are removed again before
    the result is returned.
    """
    previousWeek = lag('Weeks').over(weeksWindow)
    previousPercent = lag('Percent').over(weeksWindow)

    withLags = dataframe \
        .withColumn('weekLag', previousWeek) \
        .withColumn('percentLag', previousPercent)

    withDiffs = withLags \
        .withColumn('weekDiff', col('Weeks') - col('weekLag')) \
        .withColumn('percentDiff', col('Percent') - col('percentLag'))

    # The lag columns were only needed to build the differences.
    withoutLags = withDiffs.drop('weekLag').drop('percentLag')

    return withoutLags.withColumn('percentDeltaPerWeek', col('percentDiff') / col('weekDiff'))

# Enrich the training rows, mark the result for caching, and preview one row.
dfTrain = enrichWithWindow(df_train).persist()
dfTrain.show(1)

In [None]:
from pyspark.sql import DataFrameStatFunctions as sparkstat

# Median (quantile 0.5, relative error 0) and mean of Percent over all training rows.
percentMedian = dfTrain.approxQuantile('Percent', [0.5], 0)
avgOfPercents = dfTrain.selectExpr('avg(Percent) avgOfPercents').collect()

medianValue = percentMedian[0]
meanValue = avgOfPercents[0].avgOfPercents
print(f'Median of percents is: {medianValue}')
print(f'Average of percents is: {meanValue}')

In [None]:
# Calculate statistics about the average percent delta per week value.
# These statistics show something about the trends of the deltas.

def generateCaseWhenFromList(qList, qDict, colName):
    """Build the fragments of a SQL CASE expression that labels a column by quantile.

    Args:
        qList: Quantile labels in the order their cut-offs should be tested
            (e.g. ``[0.1, 0.5, 0.9]``).
        qDict: Mapping from a label in ``qList`` to its cut-off value. The
            cut-off of the last label is only read when ``qList`` has a single
            element.
        colName: Name of the column compared against the cut-offs.

    Returns:
        A list of SQL fragments that, joined with ``''``, read
        ``case when ... else ... end as whichDeltaQuantile``:
        the first label is assigned when ``colName <= first cut-off``, every
        middle label when ``previous cut-off < colName <= own cut-off``, and
        the last label is the ``else`` branch. An empty ``qList`` yields ``[]``.
    """
    if not qList:
        return []

    lastIndex = len(qList) - 1

    # The CASE is always opened with the first label ...
    fragments = [f'case when {colName} <= {qDict[qList[0]]} then {qList[0]} ']

    # ... followed by one bounded branch per middle label ...
    for index in range(1, lastIndex):
        lowerCut = qDict[qList[index - 1]]
        upperCut = qDict[qList[index]]
        fragments.append(
            f'when {colName} > {lowerCut} and {colName} <= {upperCut} then {qList[index]} '
        )

    # ... and always closed with the last label, so a single-element list
    # still produces a terminated expression.
    fragments.append(f'else {qList[lastIndex]} end as whichDeltaQuantile')
    return fragments
        

# One row per patient with the static attributes ...
dfPatientAttributes = dfTrain.select('Patient', 'Age', 'Sex', 'Smokingstatus').distinct()

# ... and one row per patient describing the distribution of the weekly percent change.
dfPatientDeltaAgg = dfTrain.select('Patient', 'percentDeltaPerWeek') \
    .groupBy('Patient') \
    .agg(
        sparkavg('percentDeltaPerWeek').alias('avgDelta'),
        stddev('percentDeltaPerWeek').alias('stddevDelta'),
        skewness('percentDeltaPerWeek').alias('skewnessDelta'),
        kurtosis('percentDeltaPerWeek').alias('kurtosisDelta')
    )

dfDeltaStatsTemp = dfPatientAttributes.join(dfPatientDeltaAgg, on=['Patient'], how='inner')

# Cut-offs of avgDelta at the chosen quantiles (relative error 0), keyed by quantile.
deltaQuantilesList = [0.1, 0.3, 0.5, 0.7, 0.9]
deltaQuantiles_ = dfDeltaStatsTemp.approxQuantile('avgDelta', deltaQuantilesList, 0)
deltaQuantiles = dict(zip(deltaQuantilesList, deltaQuantiles_))

# Label every patient with the quantile bucket its avgDelta falls into.
whichDeltaQuantileSql = ''.join(
    generateCaseWhenFromList(deltaQuantilesList, deltaQuantiles, 'avgDelta')
)
dfDeltaStats = dfDeltaStatsTemp.withColumn('whichDeltaQuantile', expr(whichDeltaQuantileSql))

dfDeltaStats.show()

In [None]:
# NOTE(review): within this notebook dfAgeBucket is only assigned in the following cell,
# so on a fresh kernel run top-to-bottom this line presumably raises NameError - confirm
# and move it below that cell.
dfAgeBucket.printSchema()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,VectorAssembler, StringIndexer, QuantileDiscretizer

# Bucket Age into five quantile-based bins stored in AgeBucket.
quantileDiscretizer = QuantileDiscretizer(inputCol="Age", outputCol="AgeBucket")
quantileDiscretizer.setNumBuckets(5)
ageBucketModel = quantileDiscretizer.fit(dfDeltaStats)
dfAgeBucket = ageBucketModel.transform(dfDeltaStats)

# Pipeline stages are collected in this list by the statements further down.
stages = []

def indexColumn(df, column):
  """Return a StringIndexer that writes ``column`` into ``<column>_index``.

  ``df`` is accepted but not used by this function.
  """
  indexedName = column + '_index'
  return StringIndexer(inputCol=column, outputCol=indexedName)

stringColumns = ['Sex', 'Smokingstatus', 'whichDeltaQuantile']
nominalColumns = ['AgeBucket']

# Derived column names: <name>_index from the indexers, <name>_encoded from the encoder.
indexedColumns = [f'{name}_index' for name in stringColumns]
encodedColumns = [f'{name}_encoded' for name in stringColumns]

# One StringIndexer stage per categorical column.
for column in stringColumns:
  stages.append(indexColumn(dfAgeBucket, column))

# One-hot encode the age bucket together with every indexed column.
oneHotBaby = OneHotEncoder(
  inputCols=['AgeBucket'] + indexedColumns,
  outputCols=['oneHotAgeBucket'] + encodedColumns)
stages.append(oneHotBaby)

# Assemble the encoded categories and the numeric delta statistics into one vector.
featureColumns = encodedColumns + ['oneHotAgeBucket'] + ['avgDelta', 'stddevDelta', 'skewnessDelta', 'kurtosisDelta']
assembler = VectorAssembler(
  inputCols=featureColumns,
  outputCol="features")
stages.append(assembler)

# Fit the whole pipeline on the bucketed frame and apply it to the same frame.
trendStabilityPipelineModel = Pipeline(stages=stages).fit(dfAgeBucket)
dfTrendStabilityFeatures = trendStabilityPipelineModel.transform(dfAgeBucket)
dfTrendStabilityFeatures.persist()
dfTrendStabilityFeatures.show(1)

In [None]:
# from pyspark.ml.clustering import KMeans, BisectingKMeans
# from pyspark.ml.evaluation import ClusteringEvaluator

# Trains a k-means model.

def trainBisectingKmeans(df, k):
    """Fit a bisecting k-means model with ``k`` clusters and return its silhouette score.

    Args:
        df: DataFrame to cluster. BisectingKMeans and ClusteringEvaluator are
            used with their default column settings.
        k: Number of clusters requested from BisectingKMeans.

    Returns:
        The value returned by ``ClusteringEvaluator().evaluate`` on the model's
        predictions for ``df``. The score is also printed.
    """
    # The cluster count comes from the ``k`` argument, so the function does not
    # depend on any variable of the calling cell. The seed is fixed at 1.
    kmeans = BisectingKMeans().setK(k).setSeed(1)
    kmeansModel = kmeans.fit(df)

    # Make predictions
    _dfTrendStabilityClusters = kmeansModel.transform(df)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(_dfTrendStabilityClusters)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    return silhouette
    
    
# Silhouette score (stored as a string) for every candidate cluster count, keyed by str(k).
bisectModelSilhouettes = {}
# The loop variable stays a module-level `i`: a later cell reads it after this loop.
for i in range(4, 11):
    bisectModelSilhouettes[str(i)] = str(trainBisectingKmeans(dfTrendStabilityFeatures, i))
bisectModelSilhouettes

In [None]:
# Display the silhouette scores collected per cluster count again.
bisectModelSilhouettes

In [None]:
# Refit one model to inspect its clusters - not an elegant way, but so it goes.
# NOTE(review): `i` is the variable left over from the silhouette loop, so when the
# cells run in order k is the last value that loop tried - confirm that is the intended k.
kmeans = BisectingKMeans()
kmeans.setK(i)
kmeans.setSeed(1)
kmeansModel = kmeans.fit(dfTrendStabilityFeatures)

dfTrendStabilityClusters = kmeansModel.transform(dfTrendStabilityFeatures)

In [None]:
# Number of patients assigned to each cluster.
clusterSizes = dfTrendStabilityClusters.groupBy("prediction").count()
clusterSizes.show()

In [None]:
# Preview the cluster assignment next to the patient attributes and delta statistics.
clusterPreviewColumns = ['Patient', 'Age', 'Sex', 'Smokingstatus', 'avgDelta', 'stddevDelta', 'prediction']
dfTrendStabilityClusters.select(*clusterPreviewColumns).show(20)

In [None]:
# Does age correlate with the average weekly change of Percent?
# (Author's conclusion after looking at the value: well, no.)
ageDeltaCorrelation = dfDeltaStats.corr('Age', 'avgDelta')
ageDeltaCorrelation

In [None]:
# How many measurements (weeks) does each patient have in the data?
dfNumWeeks = dfTrain.groupBy('Patient').agg(count(col('Weeks')).alias('numWeeks'))

# Summarise in one aggregation with a labelled column per statistic, rather than
# unioning three one-row frames that would all appear under the first frame's
# column header.
dfNumWeeks.agg(
    sparkmax('numWeeks').alias('maxNumWeeks'),
    sparkmin('numWeeks').alias('minNumWeeks'),
    sparkavg('numWeeks').alias('avgNumWeeks'),
).show()

In [None]:
# Span of weeks over which each patient was examined.
# Taking the first row of the ascending window gives the earliest week, and the
# first row of the descending window gives the latest one.
earliestWeek = first('Weeks').over(weeksWindow)
latestWeek = first('Weeks').over(weeksDescWindow)

dfPatientTimeRange = (
    dfTrain.select('Patient', 'Weeks')
    .withColumn('minWeek', earliestWeek)
    .withColumn('maxWeek', latestWeek)
    .withColumn('examinedTimeRange', col('maxWeek') - col('minWeek'))
    .drop('Weeks')
    .distinct()
)

dfPatientTimeRange.show(10, truncate=False)

# Number of patients per examined time range, drawn as a scatter plot.
pdTimeRangeCounts = (
    dfPatientTimeRange
    .groupBy('examinedTimeRange')
    .agg(count('Patient').alias('cnt'))
    .toPandas()
)
pdTimeRangeCounts.plot.scatter(x='examinedTimeRange', y='cnt')

In [None]:
def _deltaStatPerPatient(aggregate, alias):
    """Apply ``aggregate`` to percentDeltaPerWeek per patient and name the result ``alias``."""
    return dfTrain.select(col('Patient'), col('percentDeltaPerWeek')) \
        .groupBy('Patient') \
        .agg(aggregate(col('percentDeltaPerWeek')).alias(alias))


# Mean, kurtosis and skewness of the weekly percent change, one frame each.
dfPercentAvg = _deltaStatPerPatient(sparkavg, 'avgPercentDelta')
dfPercentKurtosis = _deltaStatPerPatient(kurtosis, 'kurtosisPercentDelta')
dfPercentSkewness = _deltaStatPerPatient(skewness, 'skewnessPercentDelta')

# Combine the three statistics into one row per patient.
dfPatientTendencies = dfPercentAvg \
    .join(dfPercentSkewness, on=['Patient'], how='inner') \
    .join(dfPercentKurtosis, on=['Patient'], how='inner')

In [None]:
# Count patients per skewness value rounded to one decimal and plot the counts.
roundedSkewCounts = (
    dfPercentSkewness
    .select(sparkround(col('skewnessPercentDelta'), 1).alias('roundedSkew'))
    .groupBy('roundedSkew')
    .agg(count(col('roundedSkew')).alias('cnt'))
    .toPandas()
)
roundedSkewCounts.plot.scatter(x='roundedSkew', y='cnt')

In [None]:
# Plot one patient's weekly percent change and raw Percent against the week number.
pdPatient = dfTrain.filter(col('Patient') == 'ID00309637202282195513787').toPandas()
pdPatient.plot.scatter(x='Weeks', y='percentDeltaPerWeek')
pdPatient.plot.scatter(x='Weeks', y='Percent')


In [None]:
# Average weekly percent change per patient, collected to pandas for plotting.
pdAvgDeltaPerPatient = dfTrain.groupBy('Patient') \
    .agg(sparkavg('percentDeltaPerWeek').alias('percentDeltaPerWeek')) \
    .toPandas()

# Despite its name, dfAgg is bound to whatever plot.scatter returns, not to a DataFrame.
dfAgg = pdAvgDeltaPerPatient.plot.scatter(x='percentDeltaPerWeek', y='Patient')

In [None]:
# Show the rows of one patient ordered by ascending week.
patientRows = dfTrain.filter(col('Patient') == 'ID00323637202285211956970')
patientRows.sort('Weeks').show()

In [None]:
import matplotlib.pyplot as plt
import pydicom
# Read one DICOM file from the training images of a single patient.
dicomPath = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/ID00323637202285211956970/99.dcm"
dataset1 = pydicom.dcmread(dicomPath)

In [None]:
# Print the DICOM dataset read in the previous cell and render its pixel data.
# The dataset is bound to `dataset1`; printing the object itself shows its
# contents, whereas printing `__str__` without calling it only shows the bound method.
print(dataset1)
plt.imshow(dataset1.pixel_array, cmap=plt.cm.bone)
plt.show()
