In [None]:
library(data.table) # to handle the data in a more convenient manner
library(tidyverse) # for a better work flow and more tools to wrangle and visualize the data
library(BBmisc) # for easy normalization of data
library(class) # for kNN classification algorithm 
library(gmodels) # for model evaluation
library(plotly) # for interactive visualization
options(warn=-1) # suppress all warnings globally (messages are still shown)

In [None]:
# Cap how many rows and columns of a table the notebook prints.
options(
  repr.matrix.max.rows = 20,
  repr.matrix.max.cols = 15
)

In [None]:
# Base directory under which the input data files are stored.
datapath <- "~/data_ad454"

## CLASSIFYING PROVINCES INTO CORRECT GEOGRAPHIC REGIONS USING KNN

In this session, we will utilize a freshly scraped dataset from the Turkish State Meteorological Service's Website following the link:

https://www.mgm.gov.tr/veridegerlendirme/il-ve-ilceler-istatistik.aspx

Using our general knowledge and common sense, we might expect that some meteorological statistics, such as temperatures or precipitation (rain) levels, are similar within geographic regions and vary across those regions.

So kNN can be a good exercise: we classify provinces into regions using those similarities within regions and differences across regions.

The table below is shown for ANKARA. The same table was collected for all 81 provinces, merged with the province-region and month-season correspondences, and wrangled.

<table xmlns:xalan="http://xml.apache.org/xalan">
  <thead>
    <tr>
      <th style="width:22%">ANKARA</th>
      <th style="width:6%">Ocak</th>
      <th style="width:6%">Şubat</th>
      <th style="width:6%">Mart</th>
      <th style="width:6%">Nisan</th>
      <th style="width:6%">Mayıs</th>
      <th style="width:6%">Haziran</th>
      <th style="width:6%">Temmuz</th>
      <th style="width:6%">Ağustos</th>
      <th style="width:6%">Eylül</th>
      <th style="width:6%">Ekim</th>
      <th style="width:6%">Kasım</th>
      <th style="width:6%">Aralık</th>
      <th style="width:6%">Yıllık</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="border:none;"> </td>
      <th colspan="13">Ölçüm Periyodu ( 1927 - 2020)</th>
    </tr>
    <tr>
      <th>Ortalama Sıcaklık (°C)</th>
      <td id="d01">0.2</td>
      <td id="d02">1.7</td>
      <td id="d03">5.7</td>
      <td id="d04">11.2</td>
      <td id="d05">16.0</td>
      <td id="d06">20.0</td>
      <td id="d07">23.4</td>
      <td id="d08">23.4</td>
      <td id="d09">18.9</td>
      <td id="d10">13.2</td>
      <td id="d11">7.2</td>
      <td id="d12">2.5</td>
      <td id="d_top">11.9</td>
    </tr>
    <tr>
      <th>Ortalama En Yüksek Sıcaklık (°C)</th>
      <td id="e01">4.2</td>
      <td id="e02">6.5</td>
      <td id="e03">11.5</td>
      <td id="e04">17.4</td>
      <td id="e05">22.4</td>
      <td id="e06">26.7</td>
      <td id="e07">30.3</td>
      <td id="e08">30.4</td>
      <td id="e09">26.1</td>
      <td id="e10">20.0</td>
      <td id="e11">13.0</td>
      <td id="e12">6.5</td>
      <td id="d_top2">17.9</td>
    </tr>
    <tr>
      <th>Ortalama En Düşük Sıcaklık (°C)</th>
      <td id="f01">-3.3</td>
      <td id="f02">-2.3</td>
      <td id="f03">0.7</td>
      <td id="f04">5.3</td>
      <td id="f05">9.7</td>
      <td id="f06">12.9</td>
      <td id="f07">15.8</td>
      <td id="f08">16.0</td>
      <td id="f09">11.8</td>
      <td id="f10">7.2</td>
      <td id="f11">2.5</td>
      <td id="f12">-0.8</td>
      <td id="d_top3">6.3</td>
    </tr>
    <tr>
      <th>Ortalama Güneşlenme Süresi (saat)</th>
      <td id="g01">2.6</td>
      <td id="g02">3.8</td>
      <td id="g03">5.1</td>
      <td id="g04">6.6</td>
      <td id="g05">8.4</td>
      <td id="g06">10.1</td>
      <td id="g07">11.3</td>
      <td id="g08">10.8</td>
      <td id="g09">9.2</td>
      <td id="g10">6.7</td>
      <td id="g11">4.6</td>
      <td id="g12">2.6</td>
      <td id="d_top4">6.8</td>
    </tr>
    <tr>
      <th>Ortalama Yağışlı Gün Sayısı</th>
      <td id="h01">14.7</td>
      <td id="h02">13.2</td>
      <td id="h03">14.3</td>
      <td id="h04">14.5</td>
      <td id="h05">16.1</td>
      <td id="h06">11.4</td>
      <td id="h07">5.6</td>
      <td id="h08">4.5</td>
      <td id="h09">5.6</td>
      <td id="h10">9.0</td>
      <td id="h11">10.6</td>
      <td id="h12">14.5</td>
      <td id="d_top5">134.0</td>
    </tr>
    <tr>
      <th>
                Aylık Toplam Yağış Miktarı Ortalaması<span style="font-size:.8em;">
                  (mm)
                </span></th>
      <td id="i01">40.1</td>
      <td id="i02">35.4</td>
      <td id="i03">39.2</td>
      <td id="i04">42.4</td>
      <td id="i05">52.0</td>
      <td id="i06">35.3</td>
      <td id="i07">14.2</td>
      <td id="i08">12.5</td>
      <td id="i09">18.1</td>
      <td id="i10">27.9</td>
      <td id="i11">31.5</td>
      <td id="i12">44.6</td>
      <td id="d_top6">393.2</td>
    </tr>
    <tr>
      <td style="border:none;"> </td>
      <th colspan="13">
                  Ölçüm Periyodu ( 1927 - 2020)
                </th>
    </tr>
    <tr>
      <th style="color:#dd4747;">En Yüksek Sıcaklık (°C)</th>
      <td id="j01" title="02.01.1995" style="color:#dd4747;">16.6</td>
      <td id="j02" title="18.02.2016" style="color:#dd4747;">21.3</td>
      <td id="j03" title="31.03.1952" style="color:#dd4747;">27.8</td>
      <td id="j04" title="23.04.1928" style="color:#dd4747;">31.6</td>
      <td id="j05" title="31.05.1935" style="color:#dd4747;">34.4</td>
      <td id="j06" title="27.06.1996" style="color:#dd4747;">37.0</td>
      <td id="j07" title="27.07.2012" style="color:#dd4747;">41.0</td>
      <td id="j08" title="07.08.2010" style="color:#dd4747;">40.4</td>
      <td id="j09" title="03.09.2020" style="color:#dd4747;">39.1</td>
      <td id="j10" title="03.10.1952" style="color:#dd4747;">33.3</td>
      <td id="j11" title="01.11.1932" style="color:#dd4747;">24.7</td>
      <td id="j12" title="02.12.1956" style="color:#dd4747;">20.4</td>
      <td style="color:#dd4747;" id="d_top7">41.0</td>
    </tr>
    <tr>
      <th style="color:#437ec1;">En Düşük Sıcaklık (°C)</th>
      <td id="k01" title="05.01.1942" style="color:#437ec1;">-24.9</td>
      <td id="k02" title="07.02.1932" style="color:#437ec1;">-24.2</td>
      <td id="k03" title="02.03.1985" style="color:#437ec1;">-19.2</td>
      <td id="k04" title="10.04.1929" style="color:#437ec1;">-7.2</td>
      <td id="k05" title="01.05.1981" style="color:#437ec1;">-1.6</td>
      <td id="k06" title="09.06.1958" style="color:#437ec1;">3.8</td>
      <td id="k07" title="11.07.1958" style="color:#437ec1;">4.5</td>
      <td id="k08" title="21.08.1949" style="color:#437ec1;">5.5</td>
      <td id="k09" title="29.09.1931" style="color:#437ec1;">-1.5</td>
      <td id="k10" title="30.10.1927" style="color:#437ec1;">-9.8</td>
      <td id="k11" title="29.11.1948" style="color:#437ec1;">-17.5</td>
      <td id="k12" title="31.12.1941" style="color:#437ec1;">-24.2</td>
      <td style="color:#437ec1;" id="d_top8">-24.9</td>
    </tr>
  </tbody>
  <tfoot>
    <tr>
      <td colspan="13">
        <i>En yüksek ve en düşük sıcaklıkların gerçekleşme tarihini görmek için fare imlecini değerlerin üstüne getiriniz.</i>
      </td>
    </tr>
  </tfoot>
</table>

In [None]:
# Read the prepared meteorological data set from the rds/ folder under datapath.
meteo_data2 <- readRDS(paste0(datapath, "/rds/08_01_meteo_data2.rds"))

In [None]:
# Inspect column names and types of the loaded data.
str(meteo_data2)

In [None]:
# Display the loaded table (the notebook renders the value of a bare expression).
meteo_data2

You are supposed to 1) wrangle the data so that it is suitable for the model, and 2) run a sample model and fine-tune it.

Wrangling steps (we should have only one line per province; monthly data might be too noisy and would give too many features, so we can summarize the data across seasons):
- Create a new column temp_diff for the difference between  av_high and av_low (you may use mutate)
- Group the data across province, region and season
- Summarise av_temp as the mean of av_temp, temp_diff as mean of temp_diff and total_rain as sum of rain_mm
- You may need to pass na.rm = T argument to deal with missing values
- If you use the dplyr pipes, you may need to ungroup the grouped tibble at the end
- Your data should look as such:

<table class="dataframe">
<caption>A tibble: 324 × 6</caption>
<thead>
	<tr><th scope="col">province</th><th scope="col">region</th><th scope="col">season</th><th scope="col">av_temp</th><th scope="col">temp_diff</th><th scope="col">total_rain</th></tr>
	<tr><th scope="col">&lt;chr&gt;</th><th scope="col">&lt;chr&gt;</th><th scope="col">&lt;chr&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th></tr>
</thead>
<tbody>
	<tr><td>ADANA         </td><td>AKDENIZ  </td><td>autumn</td><td>16.266667</td><td>11.733333</td><td>241.0</td></tr>
	<tr><td>ADANA         </td><td>AKDENIZ  </td><td>spring</td><td>21.633333</td><td>12.066667</td><td>122.0</td></tr>
	<tr><td>ADANA         </td><td>AKDENIZ  </td><td>summer</td><td>27.666667</td><td>11.766667</td><td> 39.4</td></tr>
	<tr><td>ADANA         </td><td>AKDENIZ  </td><td>winter</td><td>11.133333</td><td>10.333333</td><td>265.7</td></tr>
	<tr><td>ADIYAMAN      </td><td>GUNEYDOGU</td><td>autumn</td><td>12.533333</td><td> 9.566667</td><td>261.1</td></tr>
	<tr><td>ADIYAMAN      </td><td>GUNEYDOGU</td><td>spring</td><td>20.733333</td><td>12.166667</td><td>117.8</td></tr>
	<tr><td>ADIYAMAN      </td><td>GUNEYDOGU</td><td>summer</td><td>29.200000</td><td>14.166667</td><td> 11.1</td></tr>
	<tr><td>ADIYAMAN      </td><td>GUNEYDOGU</td><td>winter</td><td> 6.800000</td><td> 8.200000</td><td>331.4</td></tr>
	<tr><td>AFYONKARAHISAR</td><td>EGE      </td><td>autumn</td><td> 7.266667</td><td>10.900000</td><td>115.5</td></tr>
	<tr><td>AFYONKARAHISAR</td><td>EGE      </td><td>spring</td><td>14.800000</td><td>13.266667</td><td>141.5</td></tr>
	<tr><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td></tr>
	<tr><td>YALOVA   </td><td>MARMARA  </td><td>summer</td><td>22.3000000</td><td>10.133333</td><td>111.0</td></tr>
	<tr><td>YALOVA   </td><td>MARMARA  </td><td>winter</td><td> 7.2666667</td><td> 7.333333</td><td>236.3</td></tr>
	<tr><td>YOZGAT   </td><td>IC       </td><td>autumn</td><td> 5.4666667</td><td> 9.500000</td><td>163.4</td></tr>
	<tr><td>YOZGAT   </td><td>IC       </td><td>spring</td><td>12.7333333</td><td>11.466667</td><td>169.9</td></tr>
	<tr><td>YOZGAT   </td><td>IC       </td><td>summer</td><td>18.4000000</td><td>13.233333</td><td> 42.3</td></tr>
	<tr><td>YOZGAT   </td><td>IC       </td><td>winter</td><td> 0.2333333</td><td> 8.433333</td><td>194.7</td></tr>
	<tr><td>ZONGULDAK</td><td>KARADENIZ</td><td>autumn</td><td>11.8666667</td><td> 6.200000</td><td>437.1</td></tr>
	<tr><td>ZONGULDAK</td><td>KARADENIZ</td><td>spring</td><td>15.4000000</td><td> 6.866667</td><td>191.3</td></tr>
	<tr><td>ZONGULDAK</td><td>KARADENIZ</td><td>summer</td><td>20.9000000</td><td> 7.133333</td><td>255.3</td></tr>
	<tr><td>ZONGULDAK</td><td>KARADENIZ</td><td>winter</td><td> 6.6666667</td><td> 6.033333</td><td>332.4</td></tr>
</tbody>
</table>

Now we should have a wider version that includes separate variables for each season:
- Melt the data (into longer format) using province, region and season as identity variables. The name of the column that identifies the molten variables (variable.name) should be set to "variables" (the function's default is "variable")
- Cast the data (into wider format) so that province and region define unique rows, and the season and variables columns are spread into separate columns (one for each unique value combination). Make sure the output is a data.table
- Convert the region column into a factor. It will be our target label

The data should look as such:

<table class="dataframe">
<caption>A data.table: 81 × 14</caption>
<thead>
	<tr><th scope="col">province</th><th scope="col">region</th><th scope="col">autumn_av_temp</th><th scope="col">autumn_temp_diff</th><th scope="col">autumn_total_rain</th><th scope="col">spring_av_temp</th><th scope="col">spring_temp_diff</th><th scope="col">spring_total_rain</th><th scope="col">summer_av_temp</th><th scope="col">summer_temp_diff</th><th scope="col">summer_total_rain</th><th scope="col">winter_av_temp</th><th scope="col">winter_temp_diff</th><th scope="col">winter_total_rain</th></tr>
	<tr><th scope="col">&lt;chr&gt;</th><th scope="col">&lt;fct&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th><th scope="col">&lt;dbl&gt;</th></tr>
</thead>
<tbody>
	<tr><td>ADANA         </td><td>AKDENIZ  </td><td>16.266667</td><td>11.733333</td><td>241.0</td><td>21.633333</td><td>12.06667</td><td>122.0</td><td>27.66667</td><td>11.766667</td><td> 39.4</td><td>11.133333</td><td>10.333333</td><td>265.7</td></tr>
	<tr><td>ADIYAMAN      </td><td>GUNEYDOGU</td><td>12.533333</td><td> 9.566667</td><td>261.1</td><td>20.733333</td><td>12.16667</td><td>117.8</td><td>29.20000</td><td>14.166667</td><td> 11.1</td><td> 6.800000</td><td> 8.200000</td><td>331.4</td></tr>
	<tr><td>AFYONKARAHISAR</td><td>EGE      </td><td> 7.266667</td><td>10.900000</td><td>115.5</td><td>14.800000</td><td>13.26667</td><td>141.5</td><td>20.66667</td><td>15.733333</td><td> 57.0</td><td> 2.466667</td><td> 9.233333</td><td>129.3</td></tr>
	<tr><td>AGRI          </td><td>DOGU     </td><td> 1.466667</td><td>11.933333</td><td>143.9</td><td>11.466667</td><td>13.46667</td><td>185.1</td><td>19.53333</td><td>18.000000</td><td> 52.8</td><td>-7.633333</td><td>10.366667</td><td>143.1</td></tr>
	<tr><td>AKSARAY       </td><td>IC       </td><td> 7.666667</td><td>11.700000</td><td>102.4</td><td>15.966667</td><td>13.30000</td><td>120.1</td><td>21.80000</td><td>14.900000</td><td> 24.3</td><td> 2.966667</td><td>10.000000</td><td>115.5</td></tr>
	<tr><td>AMASYA        </td><td>KARADENIZ</td><td> 9.200000</td><td>10.500000</td><td>133.1</td><td>17.400000</td><td>13.76667</td><td>146.7</td><td>22.53333</td><td>14.666667</td><td> 46.4</td><td> 5.100000</td><td> 9.533333</td><td>134.6</td></tr>
	<tr><td>ANKARA        </td><td>IC       </td><td> 7.633333</td><td>10.200000</td><td>104.0</td><td>15.733333</td><td>12.86667</td><td>129.7</td><td>21.90000</td><td>14.400000</td><td> 44.8</td><td> 2.533333</td><td> 9.033333</td><td>114.7</td></tr>
	<tr><td>ANTALYA       </td><td>AKDENIZ  </td><td>15.866667</td><td>10.300000</td><td>462.4</td><td>20.766667</td><td>10.56667</td><td> 92.8</td><td>27.36667</td><td>11.533333</td><td> 25.9</td><td>11.200000</td><td> 9.333333</td><td>480.6</td></tr>
	<tr><td>ARDAHAN       </td><td>DOGU     </td><td>-0.600000</td><td>12.366667</td><td> 88.0</td><td> 8.766667</td><td>13.13333</td><td>227.6</td><td>14.60000</td><td>15.866667</td><td>168.4</td><td>-8.300000</td><td>12.000000</td><td> 71.6</td></tr>
	<tr><td>ARTVIN        </td><td>KARADENIZ</td><td> 9.200000</td><td> 7.766667</td><td>222.0</td><td>15.566667</td><td>10.40000</td><td>155.5</td><td>20.10000</td><td> 9.233333</td><td> 97.7</td><td> 4.466667</td><td> 7.966667</td><td>214.6</td></tr>
	<tr><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td></tr>
	<tr><td>SIVAS    </td><td>IC       </td><td> 4.966667</td><td>11.166667</td><td>117.4</td><td>13.13333</td><td>13.233333</td><td>152.0</td><td>18.80000</td><td>16.466667</td><td> 34.0</td><td>-0.9666667</td><td>9.000000</td><td>127.9</td></tr>
	<tr><td>TEKIRDAG </td><td>MARMARA  </td><td>11.333333</td><td> 6.733333</td><td>216.1</td><td>16.50000</td><td> 8.033333</td><td>116.8</td><td>22.56667</td><td> 8.766667</td><td> 73.4</td><td> 5.8333333</td><td>6.566667</td><td>176.8</td></tr>
	<tr><td>TOKAT    </td><td>KARADENIZ</td><td> 8.466667</td><td>10.133333</td><td>123.1</td><td>16.13333</td><td>13.200000</td><td>152.8</td><td>21.03333</td><td>13.966667</td><td> 39.5</td><td> 4.2666667</td><td>9.200000</td><td>119.8</td></tr>
	<tr><td>TRABZON  </td><td>KARADENIZ</td><td>13.100000</td><td> 6.400000</td><td>299.1</td><td>15.93333</td><td> 6.433333</td><td>161.7</td><td>22.26667</td><td> 6.233333</td><td>162.5</td><td> 7.7000000</td><td>6.366667</td><td>206.7</td></tr>
	<tr><td>TUNCELI  </td><td>DOGU     </td><td> 7.466667</td><td>11.400000</td><td>301.0</td><td>16.96667</td><td>13.766667</td><td>197.4</td><td>25.13333</td><td>16.766667</td><td> 26.4</td><td> 1.1333333</td><td>9.300000</td><td>351.9</td></tr>
	<tr><td>USAK     </td><td>EGE      </td><td> 8.733333</td><td>10.266667</td><td>185.7</td><td>15.46667</td><td>12.700000</td><td>126.1</td><td>21.96667</td><td>14.700000</td><td> 47.7</td><td> 3.9333333</td><td>9.133333</td><td>198.1</td></tr>
	<tr><td>VAN      </td><td>DOGU     </td><td> 5.266667</td><td>10.266667</td><td>131.8</td><td>13.00000</td><td>11.600000</td><td>120.0</td><td>20.76667</td><td>13.733333</td><td> 27.9</td><td>-1.3333333</td><td>9.500000</td><td>116.6</td></tr>
	<tr><td>YALOVA   </td><td>MARMARA  </td><td>12.266667</td><td> 7.833333</td><td>277.7</td><td>16.70000</td><td> 9.400000</td><td>130.6</td><td>22.30000</td><td>10.133333</td><td>111.0</td><td> 7.2666667</td><td>7.333333</td><td>236.3</td></tr>
	<tr><td>YOZGAT   </td><td>IC       </td><td> 5.466667</td><td> 9.500000</td><td>163.4</td><td>12.73333</td><td>11.466667</td><td>169.9</td><td>18.40000</td><td>13.233333</td><td> 42.3</td><td> 0.2333333</td><td>8.433333</td><td>194.7</td></tr>
	<tr><td>ZONGULDAK</td><td>KARADENIZ</td><td>11.866667</td><td> 6.200000</td><td>437.1</td><td>15.40000</td><td> 6.866667</td><td>191.3</td><td>20.90000</td><td> 7.133333</td><td>255.3</td><td> 6.6666667</td><td>6.033333</td><td>332.4</td></tr>
</tbody>
</table>

Now for the model:

- Normalize all but the first two columns using BBmisc::normalize function and z-score standardization method
- Set an arbitrary seed and split the data into train and test partitions. You may use 0.6-0.8 of the data as the train set. The train and test partitions should not include the province or region columns in order to work well with the knn function.
- Create class (region) vectors for the train and test sets
- Using class::knn function run a knn model to predict the classes for the test set
- Create a crosstable showing the actual and predicted classes. The output of gmodels::CrossTable() is a list and \$t part of this output is a matrix. The terms on the diagonal are correct classifications. The ratio of the sum of correct classifications to the total number of cases in the test set is the accuracy of your model. Alternatively you can count the times that the predicted test classes vector and the actual test classes vector have the same values and divide by the total test cases for the accuracy.
- Fine tune the model so that the accuracy is at least 0.72. You can change the k parameter and you can select the features to be included in the model. Most useful features vary less within classes and vary more between classes. A visual inspection of the table can give some idea on which features to include. But you can calculate and use an automated measure to sort the features in terms of their variability across regions (and their similarity within the regions). Hints: sd, ANOVA
- If you cannot reach 0.72, report the model with the maximum accuracy. You can create a simulation function similar to the one we used in the knn example. The function can only yield the k value and the accuracy measure and then it can report the k with the maximum accuracy.
- Note that misclassified provinces are usually the ones that lie halfway between two regions, and tied votes are broken randomly, so subsequent runs of the models may yield slightly different results.

# Answer

In [None]:
# Collapse the records to one row per province / region / season:
#   av_temp    - mean of av_temp
#   temp_diff  - mean of (av_high - av_low)
#   total_rain - sum of rain_mm
# Missing values are ignored inside each aggregate. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
meteo_data3 <- meteo_data2 %>%
  mutate(temp_diff = av_high - av_low) %>%
  group_by(province, region, season) %>%
  summarise(
    av_temp = mean(av_temp, na.rm = TRUE),
    temp_diff = mean(temp_diff, na.rm = TRUE),
    total_rain = sum(rain_mm, na.rm = TRUE)
  ) %>%
  ungroup()

In [None]:
# Reshape to one row per province with one column per season/statistic pair
# (e.g. autumn_av_temp).
# - Convert to data.table first so data.table's own melt/dcast methods are used
#   instead of relying on the redirect for non-data.table inputs.
# - The melt argument is `id.vars`; a misspelled name such as `id_vars` is
#   swallowed by `...` and the id columns are then only guessed from the types.
# - `value.var` is given explicitly so dcast does not have to guess it.
meteo_data4 <- meteo_data3 %>%
  as.data.table() %>%
  melt(
    id.vars = c("province", "region", "season"),
    variable.name = "variables"
  ) %>%
  dcast(province + region ~ season + variables, value.var = "value")

In [None]:
# Show the rows of meteo_data4 that contain at least one missing value.
meteo_data4[!stats::complete.cases(meteo_data4), ]

In [None]:
# Turn region into a factor, updating meteo_data4 by reference via `:=`.
meteo_data4[, region := as.factor(region)]

In [None]:
# Display the wide table (the notebook renders the value of a bare expression).
meteo_data4

In [None]:
# Standardise (z-score) every column except the first two, which are dropped
# from the result.
meteo_data_n <- meteo_data4[
  ,
  BBmisc::normalize(.SD, method = "standardize"),
  .SDcols = -c(1L, 2L)
]
meteo_data_n

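Now let's check whether the variables are really normalized: after z-score standardization each column should have a mean of (approximately) 0 and a standard deviation of 1, which can be verified with e.g. `meteo_data_n[, lapply(.SD, mean)]` and `meteo_data_n[, lapply(.SD, sd)]`.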

## Split data into train and test sets

In [None]:
# Fix the RNG state, then draw the row numbers of the training partition
# (70% of the rows, sampled without replacement).
set.seed(1000)
inds <- sample(nrow(meteo_data_n), size = nrow(meteo_data_n) * 0.7)

In [None]:
# Sampled rows form the training set; every remaining row goes to the test set.
meteo_test <- meteo_data_n[-inds, ]
meteo_train <- meteo_data_n[inds, ]

In [None]:
# Region labels for the same train/test rows, taken from the table that still
# holds the region column.
meteo_train_labels <- meteo_data4[["region"]][inds]
meteo_test_labels <- meteo_data4[["region"]][-inds]

In [None]:
# Baseline model: predict the region of each test row from its 4 nearest
# training rows, using all features.
meteo_test_pred <- class::knn(
  meteo_train,
  meteo_test,
  cl = meteo_train_labels,
  k = 4
)

In [None]:
# Cross-tabulate actual (rows) against predicted (columns) regions for the test
# set, without the chi-square contribution of each cell. FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
ct1 <- gmodels::CrossTable(
  x = meteo_test_labels,
  y = meteo_test_pred,
  prop.chisq = FALSE
)
ct1

In [None]:
# Count matrix of the cross table (actual vs. predicted); correct classifications sit on the diagonal.
ct1$t

In [None]:
# Rank the features by the p-value of a one-way ANOVA of the feature against
# region and keep the names of the three with the smallest p-values.
# NOTE(review): the p-values are computed on every row of meteo_data4,
# including rows that end up in the test set - confirm this is acceptable.
anova_p <- meteo_data4[
  ,
  lapply(.SD, function(feature) {
    anova_table <- summary(aov(feature ~ region))[[1]]
    anova_table[["Pr(>F)"]][1]
  }),
  .SDcols = -c("province", "region")
]
vars <- names(sort(unlist(anova_p))[1:3])

In [None]:
# Sweep k from 1 to (number of training rows - 1) using only the selected
# features, then report the k with the highest test accuracy.

# Restrict train and test to the selected features once, outside the loop.
train_sel <- meteo_train %>% select(all_of(vars))
test_sel <- meteo_test %>% select(all_of(vars))

# seq_len() gives an empty sequence (not c(1, 0)) if there is only one
# training row.
preds_l <- lapply(
  seq_len(nrow(meteo_train) - 1),
  function(k) {
    class::knn(
      train = train_sel,
      test = test_sel,
      cl = meteo_train_labels,
      k = k
    )
  }
)

# Accuracy per k: share of test rows whose predicted region equals the actual
# one. vapply() guarantees a plain numeric vector.
correct <- vapply(
  preds_l,
  function(pred) sum(pred == meteo_test_labels) / length(pred),
  numeric(1)
)

# First k with maximal accuracy, and that accuracy.
k_star <- which.max(correct)
k_star
correct[k_star]

In [None]:
# Display the names of the selected features.
vars